LLVM 19.1.0
SIISelLowering.cpp
1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPUTargetMachine.h"
18#include "GCNSubtarget.h"
21#include "SIRegisterInfo.h"
22#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/Statistic.h"
38#include "llvm/IR/IRBuilder.h"
40#include "llvm/IR/IntrinsicsAMDGPU.h"
41#include "llvm/IR/IntrinsicsR600.h"
44#include "llvm/Support/ModRef.h"
45#include <optional>
46
47using namespace llvm;
48
49#define DEBUG_TYPE "si-lower"
50
51STATISTIC(NumTailCalls, "Number of tail calls");
52
54 "amdgpu-disable-loop-alignment",
55 cl::desc("Do not align and prefetch loops"),
56 cl::init(false));
57
59 "amdgpu-use-divergent-register-indexing",
61 cl::desc("Use indirect register addressing for divergent indexes"),
62 cl::init(false));
63
68
73
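// Helper: return the first SGPR_32 register that CCInfo has not yet
// allocated; aborts via llvm_unreachable if every SGPR is already in use.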
74static unsigned findFirstFreeSGPR(CCState &CCInfo) {
75 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
76 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
77 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
78 return AMDGPU::SGPR0 + Reg;
79 }
80 }
81 llvm_unreachable("Cannot allocate sgpr");
82}
83
84SITargetLowering::SITargetLowering(const TargetMachine &TM,
85 const GCNSubtarget &STI)
86 : AMDGPUTargetLowering(TM, STI),
87 Subtarget(&STI) {
88 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
89 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
90
91 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
92 addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
93
94 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
95
96 const SIRegisterInfo *TRI = STI.getRegisterInfo();
97 const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
98
99 addRegisterClass(MVT::f64, V64RegClass);
100 addRegisterClass(MVT::v2f32, V64RegClass);
101 addRegisterClass(MVT::Untyped, V64RegClass);
102
103 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
104 addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
105
106 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
107 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
108
109 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
110 addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
111
112 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
113 addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
114
115 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
116 addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
117
118 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
119 addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
120
121 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
122 addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
123
124 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
125 addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
126
127 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
128 addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
129
130 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
131 addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288));
132
133 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
134 addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320));
135
136 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
137 addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352));
138
139 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
140 addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384));
141
142 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
143 addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
144
145 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
146 addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
147
148 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
149 addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
150
151 if (Subtarget->has16BitInsts()) {
152 if (Subtarget->useRealTrue16Insts()) {
153 addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
154 addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
155 addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
156 } else {
157 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
158 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
159 addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
160 }
161
162 // Unless there are also VOP3P operations, no operations are really legal.
163 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
164 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
165 addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
166 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
167 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
168 addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
169 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
170 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
171 addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
172 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
173 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
174 addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
175 addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
176 addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
177 addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
178 }
179
180 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
181 addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
182
184
185 // The boolean content concept here is too inflexible. Compares only ever
186 // really produce a 1-bit result. Any copy/extend from these will turn into a
187 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
188 // it's what most targets use.
191
192 // We need to custom lower vector stores from local memory
193 setOperationAction(ISD::LOAD,
194 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
195 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
196 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
197 MVT::i1, MVT::v32i32},
198 Custom);
199
200 setOperationAction(ISD::STORE,
201 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
202 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
203 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
204 MVT::i1, MVT::v32i32},
205 Custom);
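// Note: the Custom action routes these loads/stores through
// SITargetLowering::LowerOperation during legalization, where the over-wide
// vector accesses can be split into legal-sized pieces.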
206
207 if (isTypeLegal(MVT::bf16)) {
208 for (unsigned Opc :
210 ISD::FREM, ISD::FMA, ISD::FMINNUM, ISD::FMAXNUM,
211 ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FSQRT, ISD::FCBRT,
212 ISD::FSIN, ISD::FCOS, ISD::FPOW, ISD::FPOWI,
213 ISD::FLDEXP, ISD::FFREXP, ISD::FLOG, ISD::FLOG2,
214 ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10,
215 ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FNEARBYINT,
216 ISD::FROUND, ISD::FROUNDEVEN, ISD::FFLOOR, ISD::FCANONICALIZE,
217 ISD::SETCC}) {
218 // FIXME: The promoted to type shouldn't need to be explicit
219 setOperationAction(Opc, MVT::bf16, Promote);
220 AddPromotedToType(Opc, MVT::bf16, MVT::f32);
221 }
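// The Promote + AddPromotedToType pairing above makes the legalizer execute
// each of these bf16 operations in f32, inserting extends/rounds around the
// operands and result.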
222
224
225 setOperationAction(ISD::SELECT, MVT::bf16, Promote);
226 AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
227
228 setOperationAction(ISD::FABS, MVT::bf16, Legal);
229 setOperationAction(ISD::FNEG, MVT::bf16, Legal);
231
232 // We only need to custom lower because we can't specify an action for bf16
233 // sources.
236 }
237
238 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
239 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
240 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
241 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
242 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
243 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
244 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
245 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
246 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
247 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
248 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
249 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
250 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
251 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
252 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
253 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
254
255 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
256 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
257 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
258 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
259 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
260 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
261 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
262
263 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
264
267 setOperationAction(ISD::SELECT, MVT::f64, Promote);
268 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
269
270 setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
271
273 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
274
275 setOperationAction(ISD::SETCC, MVT::i1, Promote);
276 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
277 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
278
280 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
281 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
282 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
283 Expand);
285 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
286 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
287 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
288 Expand);
289
291 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
292 MVT::v3i16, MVT::v4i16, MVT::Other},
293 Custom);
294
295 setOperationAction(ISD::BRCOND, MVT::Other, Custom);
296 setOperationAction(ISD::BR_CC,
297 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
298
300
302
304 Expand);
305
306#if 0
308#endif
309
310 // We only support LOAD/STORE and vector manipulation ops for vectors
311 // with > 4 elements.
312 for (MVT VT :
313 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
314 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
315 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
316 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
317 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
318 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
319 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
320 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
321 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
322 switch (Op) {
323 case ISD::LOAD:
324 case ISD::STORE:
326 case ISD::BITCAST:
327 case ISD::UNDEF:
331 case ISD::IS_FPCLASS:
332 break;
337 break;
338 default:
340 break;
341 }
342 }
343 }
344
345 setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand);
346
347 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
348 // is expanded to avoid having two separate loops in case the index is a VGPR.
349
350 // Most operations are naturally 32-bit vector operations. We only support
351 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
352 for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
354 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
355
357 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
358
360 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
361
363 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
364 }
365
366 for (MVT Vec64 : { MVT::v3i64, MVT::v3f64 }) {
368 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
369
371 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
372
374 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
375
377 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
378 }
379
380 for (MVT Vec64 : { MVT::v4i64, MVT::v4f64 }) {
382 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
383
385 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
386
388 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
389
391 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
392 }
393
394 for (MVT Vec64 : { MVT::v8i64, MVT::v8f64 }) {
396 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
397
399 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
400
402 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
403
405 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
406 }
407
408 for (MVT Vec64 : { MVT::v16i64, MVT::v16f64 }) {
410 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
411
413 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
414
416 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
417
419 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
420 }
421
423 {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},
424 Expand);
425
426 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
427 Custom);
428
429 // Avoid stack access for these.
430 // TODO: Generalize to more vector types.
432 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
433 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
434 Custom);
435
436 // Deal with vec3 vector operations when widened to vec4.
438 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
439
440 // Deal with vec5/6/7 vector operations when widened to vec8.
442 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
443 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
444 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
445 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
446 Custom);
447
448 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
449 // and output demarshalling
450 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
451
452 // We can't return success/failure, only the old value,
453 // let LLVM add the comparison
454 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, {MVT::i32, MVT::i64},
455 Expand);
456
457 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
458
459 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
460
461 // FIXME: This should be narrowed to i32, but that only happens if i64 is
462 // illegal.
463 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
464 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
465
466 // This is s_memtime on SI and s_memrealtime on VI.
467 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
468
469 if (Subtarget->hasSMemRealTime() ||
471 setOperationAction(ISD::READSTEADYCOUNTER, MVT::i64, Legal);
472 setOperationAction({ISD::TRAP, ISD::DEBUGTRAP}, MVT::Other, Custom);
473
474 if (Subtarget->has16BitInsts()) {
475 setOperationAction({ISD::FPOW, ISD::FPOWI}, MVT::f16, Promote);
476 setOperationAction({ISD::FLOG, ISD::FEXP, ISD::FLOG10}, MVT::f16, Custom);
477 } else {
478 setOperationAction(ISD::FSQRT, MVT::f16, Custom);
479 }
480
481 if (Subtarget->hasMadMacF32Insts())
483
484 if (!Subtarget->hasBFI())
485 // fcopysign can be done in a single instruction with BFI.
486 setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);
487
488 if (!Subtarget->hasBCNT(32))
490
491 if (!Subtarget->hasBCNT(64))
493
494 if (Subtarget->hasFFBH())
496
497 if (Subtarget->hasFFBL())
499
500 // We only really have 32-bit BFE instructions (and 16-bit on VI).
501 //
502 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
503 // effort to match them now. We want this to be false for i64 cases when the
504 // extraction isn't restricted to the upper or lower half. Ideally we would
505 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
506 // span the midpoint are probably relatively rare, so don't worry about them
507 // for now.
508 if (Subtarget->hasBFE())
510
511 // Clamp modifier on add/sub
512 if (Subtarget->hasIntClamp())
514
515 if (Subtarget->hasAddNoCarry())
516 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
517 Legal);
518
519 setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, {MVT::f32, MVT::f64},
520 Custom);
521
522 // These are really only legal for ieee_mode functions. We should be avoiding
523 // them for functions that don't have ieee_mode enabled, so just say they are
524 // legal.
525 setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE},
526 {MVT::f32, MVT::f64}, Legal);
527
528 if (Subtarget->haveRoundOpsF64())
529 setOperationAction({ISD::FTRUNC, ISD::FCEIL, ISD::FROUNDEVEN}, MVT::f64,
530 Legal);
531 else
532 setOperationAction({ISD::FCEIL, ISD::FTRUNC, ISD::FROUNDEVEN, ISD::FFLOOR},
533 MVT::f64, Custom);
534
535 setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
536 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
537 Legal);
538 setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
539
540 setOperationAction({ISD::FSIN, ISD::FCOS, ISD::FDIV}, MVT::f32, Custom);
542
543 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
544 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
545
546 // Custom lower these because we can't specify a rule based on an illegal
547 // source bf16.
548 setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f32, Custom);
549 setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f64, Custom);
550
551 if (Subtarget->has16BitInsts()) {
554 MVT::i16, Legal);
555
556 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
557
559 MVT::i16, Expand);
560
564 ISD::CTPOP},
565 MVT::i16, Promote);
566
567 setOperationAction(ISD::LOAD, MVT::i16, Custom);
568
569 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
570
571 setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
572 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
573 setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
574 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
575
579
581
582 // F16 - Constant Actions.
585
586 // F16 - Load/Store Actions.
587 setOperationAction(ISD::LOAD, MVT::f16, Promote);
588 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
589 setOperationAction(ISD::STORE, MVT::f16, Promote);
590 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
591
592 // BF16 - Load/Store Actions.
593 setOperationAction(ISD::LOAD, MVT::bf16, Promote);
594 AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
595 setOperationAction(ISD::STORE, MVT::bf16, Promote);
596 AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
597
598 // F16 - VOP1 Actions.
600 ISD::FSIN, ISD::FROUND, ISD::FPTRUNC_ROUND},
601 MVT::f16, Custom);
602
605
606 // F16 - VOP2 Actions.
607 setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
608 Expand);
609 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, MVT::f16, Custom);
610 setOperationAction(ISD::FFREXP, MVT::f16, Custom);
612
613 // F16 - VOP3 Actions.
615 if (STI.hasMadF16())
617
618 for (MVT VT :
619 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
620 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
621 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
622 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
623 switch (Op) {
624 case ISD::LOAD:
625 case ISD::STORE:
627 case ISD::BITCAST:
628 case ISD::UNDEF:
634 case ISD::IS_FPCLASS:
635 break;
638 break;
639 default:
641 break;
642 }
643 }
644 }
645
646 // v_perm_b32 can handle either of these.
647 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
649
650 // XXX - Do these do anything? Vector constants turn into build_vector.
651 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
652
653 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
654 Legal);
655
656 setOperationAction(ISD::STORE, MVT::v2i16, Promote);
657 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
658 setOperationAction(ISD::STORE, MVT::v2f16, Promote);
659 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
660
661 setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
662 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
663 setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
664 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
665
666 setOperationAction(ISD::AND, MVT::v2i16, Promote);
667 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
668 setOperationAction(ISD::OR, MVT::v2i16, Promote);
669 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
670 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
671 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
672
673 setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
674 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
675 setOperationAction(ISD::LOAD, MVT::v4f16, Promote);
676 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
677 setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
678 AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
679
680 setOperationAction(ISD::STORE, MVT::v4i16, Promote);
681 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
682 setOperationAction(ISD::STORE, MVT::v4f16, Promote);
683 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
684 setOperationAction(ISD::STORE, MVT::v4bf16, Promote);
685 AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
686
687 setOperationAction(ISD::LOAD, MVT::v8i16, Promote);
688 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
689 setOperationAction(ISD::LOAD, MVT::v8f16, Promote);
690 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
691 setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
692 AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
693
694 setOperationAction(ISD::STORE, MVT::v4i16, Promote);
695 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
696 setOperationAction(ISD::STORE, MVT::v4f16, Promote);
697 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
698
699 setOperationAction(ISD::STORE, MVT::v8i16, Promote);
700 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
701 setOperationAction(ISD::STORE, MVT::v8f16, Promote);
702 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
703 setOperationAction(ISD::STORE, MVT::v8bf16, Promote);
704 AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
705
706 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
707 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
708 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
709 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
710 setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
711 AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
712
713 setOperationAction(ISD::STORE, MVT::v16i16, Promote);
714 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
715 setOperationAction(ISD::STORE, MVT::v16f16, Promote);
716 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
717 setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
718 AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
719
720 setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
721 AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
722 setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
723 AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
724 setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
725 AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);
726
727 setOperationAction(ISD::STORE, MVT::v32i16, Promote);
728 AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
729 setOperationAction(ISD::STORE, MVT::v32f16, Promote);
730 AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
731 setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
732 AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
733
735 MVT::v2i32, Expand);
736 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
737
739 MVT::v4i32, Expand);
740
742 MVT::v8i32, Expand);
743
744 setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
745 Subtarget->hasVOP3PInsts() ? Legal : Custom);
746
747 setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
748 // This isn't really legal, but this avoids the legalizer unrolling it (and
749 // allows matching fneg (fabs x) patterns)
750 setOperationAction(ISD::FABS, MVT::v2f16, Legal);
751
752 setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom);
753 setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, Legal);
754
755 setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE},
756 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
757 Custom);
758
759 setOperationAction({ISD::FMINNUM, ISD::FMAXNUM},
760 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
761 Expand);
762
763 for (MVT Vec16 :
764 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
765 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
768 Vec16, Custom);
770 }
771 }
772
773 if (Subtarget->hasVOP3PInsts()) {
777 MVT::v2i16, Legal);
778
779 setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FMINNUM_IEEE,
780 ISD::FMAXNUM_IEEE, ISD::FCANONICALIZE},
781 MVT::v2f16, Legal);
782
783 setOperationAction(ISD::EXTRACT_VECTOR_ELT, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
784 Custom);
785
787 {MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16,
788 MVT::v16f16, MVT::v16i16, MVT::v32f16, MVT::v32i16},
789 Custom);
790
791 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
792 // Split vector operations.
797 VT, Custom);
798
799 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
800 // Split vector operations.
802 VT, Custom);
803
804 setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, {MVT::v2f16, MVT::v4f16},
805 Custom);
806
807 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
808 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
809 Custom);
810
811 if (Subtarget->hasPackedFP32Ops()) {
813 MVT::v2f32, Legal);
815 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
816 Custom);
817 }
818 }
819
820 setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v4f16, Custom);
821
822 if (Subtarget->has16BitInsts()) {
823 setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
824 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
825 setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
826 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
827 } else {
828 // Legalization hack.
829 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
830
831 setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v2f16, Custom);
832 }
833
835 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
836 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
837 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
838 MVT::v32f16, MVT::v32bf16},
839 Custom);
840
842
843 if (Subtarget->hasScalarSMulU64())
845
846 if (Subtarget->hasMad64_32())
848
849 if (Subtarget->hasPrefetch())
850 setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
851
852 if (Subtarget->hasIEEEMinMax()) {
853 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM},
854 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
855 setOperationAction({ISD::FMINIMUM, ISD::FMAXIMUM},
856 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
857 Custom);
858 }
859
861 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
862 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
863 MVT::i8},
864 Custom);
865
867 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
868 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
869 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
870 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
871 Custom);
872
874 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
875 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
876 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
877 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
878 Custom);
879
880 setOperationAction(ISD::STACKSAVE, MVT::Other, Custom);
882 setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
883 setOperationAction(ISD::GET_FPENV, MVT::i64, Custom);
884 setOperationAction(ISD::SET_FPENV, MVT::i64, Custom);
885
886 // TODO: Could move this to custom lowering, could benefit from combines on
887 // extract of relevant bits.
888 setOperationAction(ISD::GET_FPMODE, MVT::i32, Legal);
889
891
894 ISD::SUB,
896 ISD::FADD,
897 ISD::FSUB,
898 ISD::FDIV,
899 ISD::FMINNUM,
900 ISD::FMAXNUM,
901 ISD::FMINNUM_IEEE,
902 ISD::FMAXNUM_IEEE,
903 ISD::FMINIMUM,
904 ISD::FMAXIMUM,
905 ISD::FMA,
906 ISD::SMIN,
907 ISD::SMAX,
908 ISD::UMIN,
909 ISD::UMAX,
911 ISD::AND,
912 ISD::OR,
913 ISD::XOR,
914 ISD::FSHR,
924
925 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
927
928 // All memory operations. Some folding on the pointer operand is done to help
929 // matching the constant offsets in the addressing modes.
930 setTargetDAGCombine({ISD::LOAD,
931 ISD::STORE,
932 ISD::ATOMIC_LOAD,
933 ISD::ATOMIC_STORE,
934 ISD::ATOMIC_CMP_SWAP,
935 ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
936 ISD::ATOMIC_SWAP,
937 ISD::ATOMIC_LOAD_ADD,
938 ISD::ATOMIC_LOAD_SUB,
939 ISD::ATOMIC_LOAD_AND,
940 ISD::ATOMIC_LOAD_OR,
941 ISD::ATOMIC_LOAD_XOR,
942 ISD::ATOMIC_LOAD_NAND,
943 ISD::ATOMIC_LOAD_MIN,
944 ISD::ATOMIC_LOAD_MAX,
945 ISD::ATOMIC_LOAD_UMIN,
946 ISD::ATOMIC_LOAD_UMAX,
947 ISD::ATOMIC_LOAD_FADD,
948 ISD::ATOMIC_LOAD_FMIN,
949 ISD::ATOMIC_LOAD_FMAX,
950 ISD::ATOMIC_LOAD_UINC_WRAP,
951 ISD::ATOMIC_LOAD_UDEC_WRAP,
954
955 // FIXME: In other contexts we pretend this is a per-function property.
957
959}
960
962 return Subtarget;
963}
964
966 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
967 return RCRegs;
968}
969
970//===----------------------------------------------------------------------===//
971// TargetLowering queries
972//===----------------------------------------------------------------------===//
973
974// v_mad_mix* support a conversion from f16 to f32.
975//
976// There is only one special case, when denormals are enabled, that we don't
977// currently handle where this is OK to use.
978bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
979 EVT DestVT, EVT SrcVT) const {
980 return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
981 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
982 DestVT.getScalarType() == MVT::f32 &&
983 SrcVT.getScalarType() == MVT::f16 &&
984 // TODO: This probably only requires no input flushing?
986}
987
989 LLT DestTy, LLT SrcTy) const {
990 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
991 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
992 DestTy.getScalarSizeInBits() == 32 &&
993 SrcTy.getScalarSizeInBits() == 16 &&
994 // TODO: This probably only requires no input flushing?
996}
997
999 // SI has some legal vector types, but no legal vector operations. Say no
1000 // shuffles are legal in order to prefer scalarizing some vector operations.
1001 return false;
1002}
1003
1005 CallingConv::ID CC,
1006 EVT VT) const {
1009
1010 if (VT.isVector()) {
1011 EVT ScalarVT = VT.getScalarType();
1012 unsigned Size = ScalarVT.getSizeInBits();
1013 if (Size == 16) {
1014 if (Subtarget->has16BitInsts()) {
1015 if (VT.isInteger())
1016 return MVT::v2i16;
1017 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
1018 }
1019 return VT.isInteger() ? MVT::i32 : MVT::f32;
1020 }
1021
1022 if (Size < 16)
1023 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1024 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1025 }
1026
1027 if (VT.getSizeInBits() > 32)
1028 return MVT::i32;
1029
1031}
1032
1034 CallingConv::ID CC,
1035 EVT VT) const {
1038
1039 if (VT.isVector()) {
1040 unsigned NumElts = VT.getVectorNumElements();
1041 EVT ScalarVT = VT.getScalarType();
1042 unsigned Size = ScalarVT.getSizeInBits();
1043
1044 // FIXME: Should probably promote 8-bit vectors to i16.
1045 if (Size == 16 && Subtarget->has16BitInsts())
1046 return (NumElts + 1) / 2;
1047
1048 if (Size <= 32)
1049 return NumElts;
1050
1051 if (Size > 32)
1052 return NumElts * ((Size + 31) / 32);
1053 } else if (VT.getSizeInBits() > 32)
1054 return (VT.getSizeInBits() + 31) / 32;
1055
1057}
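// For example, with 16-bit instructions a v4f16 argument takes 2 registers
// (packed pairs), a v3i64 argument takes 6 (each 64-bit element split into
// two 32-bit registers), and a scalar f64 takes 2.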
1058
1060 LLVMContext &Context, CallingConv::ID CC,
1061 EVT VT, EVT &IntermediateVT,
1062 unsigned &NumIntermediates, MVT &RegisterVT) const {
1063 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1064 unsigned NumElts = VT.getVectorNumElements();
1065 EVT ScalarVT = VT.getScalarType();
1066 unsigned Size = ScalarVT.getSizeInBits();
1067 // FIXME: We should fix the ABI to be the same on targets without 16-bit
1068 // support, but unless we can properly handle 3-vectors, it will still be
1069 // inconsistent.
1070 if (Size == 16 && Subtarget->has16BitInsts()) {
1071 if (ScalarVT == MVT::bf16) {
1072 RegisterVT = MVT::i32;
1073 IntermediateVT = MVT::v2bf16;
1074 } else {
1075 RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
1076 IntermediateVT = RegisterVT;
1077 }
1078 NumIntermediates = (NumElts + 1) / 2;
1079 return NumIntermediates;
1080 }
1081
1082 if (Size == 32) {
1083 RegisterVT = ScalarVT.getSimpleVT();
1084 IntermediateVT = RegisterVT;
1085 NumIntermediates = NumElts;
1086 return NumIntermediates;
1087 }
1088
1089 if (Size < 16 && Subtarget->has16BitInsts()) {
1090 // FIXME: Should probably form v2i16 pieces
1091 RegisterVT = MVT::i16;
1092 IntermediateVT = ScalarVT;
1093 NumIntermediates = NumElts;
1094 return NumIntermediates;
1095 }
1096
1097
1098 if (Size != 16 && Size <= 32) {
1099 RegisterVT = MVT::i32;
1100 IntermediateVT = ScalarVT;
1101 NumIntermediates = NumElts;
1102 return NumIntermediates;
1103 }
1104
1105 if (Size > 32) {
1106 RegisterVT = MVT::i32;
1107 IntermediateVT = RegisterVT;
1108 NumIntermediates = NumElts * ((Size + 31) / 32);
1109 return NumIntermediates;
1110 }
1111 }
1112
1114 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1115}
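// For example, with 16-bit instructions a v7f16 argument is broken into 4
// v2f16 intermediates, while a v5f32 argument becomes 5 f32 pieces, one per
// 32-bit register.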
1116
1118 const DataLayout &DL, Type *Ty,
1119 unsigned MaxNumLanes) {
1120 assert(MaxNumLanes != 0);
1121
1122 LLVMContext &Ctx = Ty->getContext();
1123 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1124 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1125 return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
1126 NumElts);
1127 }
1128
1129 return TLI.getValueType(DL, Ty);
1130}
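// For example, a <4 x float> return type with MaxNumLanes == 2 yields a
// v2f32 memory type; non-vector types are returned unchanged.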
1131
1132// Peek through TFE struct returns to only use the data size.
1134 const DataLayout &DL, Type *Ty,
1135 unsigned MaxNumLanes) {
1136 auto *ST = dyn_cast<StructType>(Ty);
1137 if (!ST)
1138 return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
1139
1140 // TFE intrinsics return an aggregate type.
1141 assert(ST->getNumContainedTypes() == 2 &&
1142 ST->getContainedType(1)->isIntegerTy(32));
1143 return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
1144}
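// For example, an intrinsic returning { <4 x float>, i32 } (data plus TFE
// status word) reports only the v4f32 data portion as its memory type.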
1145
1146/// Map address space 7 to MVT::v5i32 because that's its in-memory
1147/// representation. This return value is vector-typed because there is no
1148/// MVT::i160 and it is not clear if one can be added. While this could
1149/// cause issues during codegen, these address space 7 pointers will be
1150/// rewritten away by then. Therefore, we can return MVT::v5i32 in order
1151/// to allow pre-codegen passes that query TargetTransformInfo, often for cost
1152/// modeling, to work.
1154 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1155 return MVT::v5i32;
1157 DL.getPointerSizeInBits(AS) == 192)
1158 return MVT::v6i32;
1160}
1161/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1162/// v8i32 when padding is added.
1163/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1164/// also v8i32 with padding.
1166 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1167 DL.getPointerSizeInBits(AS) == 160) ||
1169 DL.getPointerSizeInBits(AS) == 192))
1170 return MVT::v8i32;
1172}
1173
1175 const CallInst &CI,
1176 MachineFunction &MF,
1177 unsigned IntrID) const {
1178 Info.flags = MachineMemOperand::MONone;
1179 if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1180 Info.flags |= MachineMemOperand::MOInvariant;
1181
1182 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1185 (Intrinsic::ID)IntrID);
1186 MemoryEffects ME = Attr.getMemoryEffects();
1187 if (ME.doesNotAccessMemory())
1188 return false;
1189
1190 // TODO: Should images get their own address space?
1191 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1192
1193 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
1194 if (RsrcIntr->IsImage) {
1197 BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1198 Info.align.reset();
1199 }
1200
1201 Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
1202 if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1203 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1204 // We conservatively set the memory operand of a buffer intrinsic to the
1205 // base resource pointer, so that we can access alias information about
1206 // those pointers. Cases like "this points at the same value
1207 // but with a different offset" are handled in
1208 // areMemAccessesTriviallyDisjoint.
1209 Info.ptrVal = RsrcArg;
1210 }
1211
1212 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1213 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1214 Info.flags |= MachineMemOperand::MOVolatile;
1216 if (ME.onlyReadsMemory()) {
1217 if (RsrcIntr->IsImage) {
1218 unsigned MaxNumLanes = 4;
1219
1220 if (!BaseOpcode->Gather4) {
1221 // If this isn't a gather, we may have excess loaded elements in the
1222 // IR type. Check the dmask for the real number of elements loaded.
1223 unsigned DMask
1224 = cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1225 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1226 }
1227
1228 Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
1229 CI.getType(), MaxNumLanes);
1230 } else {
1231 Info.memVT =
1233 std::numeric_limits<unsigned>::max());
1234 }
1235
1236 // FIXME: What does alignment mean for an image?
1237 Info.opc = ISD::INTRINSIC_W_CHAIN;
1238 Info.flags |= MachineMemOperand::MOLoad;
1239 } else if (ME.onlyWritesMemory()) {
1240 Info.opc = ISD::INTRINSIC_VOID;
1241
1242 Type *DataTy = CI.getArgOperand(0)->getType();
1243 if (RsrcIntr->IsImage) {
1244 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1245 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1246 Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
1247 DMaskLanes);
1248 } else
1249 Info.memVT = getValueType(MF.getDataLayout(), DataTy);
1250
1251 Info.flags |= MachineMemOperand::MOStore;
1252 } else {
1253 // Atomic or NoReturn Sampler
1254 Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID :
1256 Info.flags |= MachineMemOperand::MOLoad |
1259
1260 switch (IntrID) {
1261 default:
1262 if (RsrcIntr->IsImage && BaseOpcode->NoReturn) {
1263 // Fake memory access type for no return sampler intrinsics
1264 Info.memVT = MVT::i32;
1265 } else {
1266 // XXX - Should this be volatile without known ordering?
1267 Info.flags |= MachineMemOperand::MOVolatile;
1268 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1269 }
1270 break;
1271 case Intrinsic::amdgcn_raw_buffer_load_lds:
1272 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1273 case Intrinsic::amdgcn_struct_buffer_load_lds:
1274 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1275 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1276 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1277 Info.ptrVal = CI.getArgOperand(1);
1278 return true;
1279 }
1280 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1281 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load: {
1282 Info.memVT =
1284 std::numeric_limits<unsigned>::max());
1285 Info.flags &= ~MachineMemOperand::MOStore;
1286 return true;
1287 }
1288 }
1289 }
1290 return true;
1291 }
1292
1293 switch (IntrID) {
1294 case Intrinsic::amdgcn_ds_ordered_add:
1295 case Intrinsic::amdgcn_ds_ordered_swap: {
1296 Info.opc = ISD::INTRINSIC_W_CHAIN;
1297 Info.memVT = MVT::getVT(CI.getType());
1298 Info.ptrVal = CI.getOperand(0);
1299 Info.align.reset();
1301
1302 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1303 if (!Vol->isZero())
1304 Info.flags |= MachineMemOperand::MOVolatile;
1305
1306 return true;
1307 }
1308 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1309 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1310 Info.opc = ISD::INTRINSIC_W_CHAIN;
1311 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1312 Info.ptrVal = nullptr;
1313 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1315 return true;
1316 }
1317 case Intrinsic::amdgcn_ds_append:
1318 case Intrinsic::amdgcn_ds_consume: {
1319 Info.opc = ISD::INTRINSIC_W_CHAIN;
1320 Info.memVT = MVT::getVT(CI.getType());
1321 Info.ptrVal = CI.getOperand(0);
1322 Info.align.reset();
1324
1325 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1326 if (!Vol->isZero())
1327 Info.flags |= MachineMemOperand::MOVolatile;
1328
1329 return true;
1330 }
1331 case Intrinsic::amdgcn_global_atomic_csub: {
1332 Info.opc = ISD::INTRINSIC_W_CHAIN;
1333 Info.memVT = MVT::getVT(CI.getType());
1334 Info.ptrVal = CI.getOperand(0);
1335 Info.align.reset();
1336 Info.flags |= MachineMemOperand::MOLoad |
1339 return true;
1340 }
1341 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
1342 Info.opc = ISD::INTRINSIC_W_CHAIN;
1343 Info.memVT = MVT::getVT(CI.getType()); // XXX: what is correct VT?
1344
1345 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1346 Info.align.reset();
1347 Info.flags |= MachineMemOperand::MOLoad |
1349 return true;
1350 }
1351 case Intrinsic::amdgcn_global_atomic_fadd:
1352 case Intrinsic::amdgcn_global_atomic_fmin:
1353 case Intrinsic::amdgcn_global_atomic_fmax:
1354 case Intrinsic::amdgcn_global_atomic_fmin_num:
1355 case Intrinsic::amdgcn_global_atomic_fmax_num:
1356 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1357 case Intrinsic::amdgcn_flat_atomic_fadd:
1358 case Intrinsic::amdgcn_flat_atomic_fmin:
1359 case Intrinsic::amdgcn_flat_atomic_fmax:
1360 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1361 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1362 case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
1363 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1364 case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: {
1365 Info.opc = ISD::INTRINSIC_W_CHAIN;
1366 Info.memVT = MVT::getVT(CI.getType());
1367 Info.ptrVal = CI.getOperand(0);
1368 Info.align.reset();
1369 Info.flags |= MachineMemOperand::MOLoad |
1373 return true;
1374 }
1375 case Intrinsic::amdgcn_global_load_tr_b64:
1376 case Intrinsic::amdgcn_global_load_tr_b128: {
1377 Info.opc = ISD::INTRINSIC_W_CHAIN;
1378 Info.memVT = MVT::getVT(CI.getType());
1379 Info.ptrVal = CI.getOperand(0);
1380 Info.align.reset();
1381 Info.flags |= MachineMemOperand::MOLoad;
1382 return true;
1383 }
1384 case Intrinsic::amdgcn_ds_gws_init:
1385 case Intrinsic::amdgcn_ds_gws_barrier:
1386 case Intrinsic::amdgcn_ds_gws_sema_v:
1387 case Intrinsic::amdgcn_ds_gws_sema_br:
1388 case Intrinsic::amdgcn_ds_gws_sema_p:
1389 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1390 Info.opc = ISD::INTRINSIC_VOID;
1391
1392 const GCNTargetMachine &TM =
1393 static_cast<const GCNTargetMachine &>(getTargetMachine());
1394
1396 Info.ptrVal = MFI->getGWSPSV(TM);
1397
1398 // This is an abstract access, but we need to specify a type and size.
1399 Info.memVT = MVT::i32;
1400 Info.size = 4;
1401 Info.align = Align(4);
1402
1403 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1404 Info.flags |= MachineMemOperand::MOLoad;
1405 else
1406 Info.flags |= MachineMemOperand::MOStore;
1407 return true;
1408 }
1409 case Intrinsic::amdgcn_global_load_lds: {
1410 Info.opc = ISD::INTRINSIC_VOID;
1411 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1412 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1413 Info.ptrVal = CI.getArgOperand(1);
1415 return true;
1416 }
1417 case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
1418 Info.opc = ISD::INTRINSIC_W_CHAIN;
1419
1420 const GCNTargetMachine &TM =
1421 static_cast<const GCNTargetMachine &>(getTargetMachine());
1422
1424 Info.ptrVal = MFI->getGWSPSV(TM);
1425
1426 // This is an abstract access, but we need to specify a type and size.
1427 Info.memVT = MVT::i32;
1428 Info.size = 4;
1429 Info.align = Align(4);
1430
1432 return true;
1433 }
1434 default:
1435 return false;
1436 }
1437}
1438
1440 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1442 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1443 // The DAG's ValueType loses the addrspaces.
1444 // Add them as 2 extra Constant operands "from" and "to".
1445 unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1446 unsigned DstAS = I.getType()->getPointerAddressSpace();
1447 Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
1448 Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
1449 break;
1450 }
1451 default:
1452 break;
1453 }
1454}
1455
1458 Type *&AccessTy) const {
1459 Value *Ptr = nullptr;
1460 switch (II->getIntrinsicID()) {
1461 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1462 case Intrinsic::amdgcn_ds_append:
1463 case Intrinsic::amdgcn_ds_consume:
1464 case Intrinsic::amdgcn_ds_ordered_add:
1465 case Intrinsic::amdgcn_ds_ordered_swap:
1466 case Intrinsic::amdgcn_flat_atomic_fadd:
1467 case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
1468 case Intrinsic::amdgcn_flat_atomic_fmax:
1469 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1470 case Intrinsic::amdgcn_flat_atomic_fmin:
1471 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1472 case Intrinsic::amdgcn_global_atomic_csub:
1473 case Intrinsic::amdgcn_global_atomic_fadd:
1474 case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
1475 case Intrinsic::amdgcn_global_atomic_fmax:
1476 case Intrinsic::amdgcn_global_atomic_fmax_num:
1477 case Intrinsic::amdgcn_global_atomic_fmin:
1478 case Intrinsic::amdgcn_global_atomic_fmin_num:
1479 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1480 case Intrinsic::amdgcn_global_load_tr_b64:
1481 case Intrinsic::amdgcn_global_load_tr_b128:
1482 Ptr = II->getArgOperand(0);
1483 break;
1484 case Intrinsic::amdgcn_global_load_lds:
1485 Ptr = II->getArgOperand(1);
1486 break;
1487 default:
1488 return false;
1489 }
1490 AccessTy = II->getType();
1491 Ops.push_back(Ptr);
1492 return true;
1493}
1494
1496 unsigned AddrSpace) const {
1497 if (!Subtarget->hasFlatInstOffsets()) {
1498 // Flat instructions do not have offsets, and only have the register
1499 // address.
1500 return AM.BaseOffs == 0 && AM.Scale == 0;
1501 }
1502
1503 decltype(SIInstrFlags::FLAT) FlatVariant =
1507
1508 return AM.Scale == 0 &&
1509 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1510 AM.BaseOffs, AddrSpace, FlatVariant));
1511}
1512
1514 if (Subtarget->hasFlatGlobalInsts())
1516
1517 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1518 // Assume that we will use FLAT for all global memory accesses
1519 // on VI.
1520 // FIXME: This assumption is currently wrong. On VI we still use
1521 // MUBUF instructions for the r + i addressing mode. As currently
1522 // implemented, the MUBUF instructions only work on buffer < 4GB.
1523 // It may be possible to support > 4GB buffers with MUBUF instructions,
1524 // by setting the stride value in the resource descriptor which would
1525 // increase the size limit to (stride * 4GB). However, this is risky,
1526 // because it has never been validated.
1528 }
1529
1530 return isLegalMUBUFAddressingMode(AM);
1531}
1532
1533bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1534 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1535 // additionally can do r + r + i with addr64. 32-bit has more addressing
1536 // mode options. Depending on the resource constant, it can also do
1537 // (i64 r0) + (i32 r1) * (i14 i).
1538 //
1539 // Private arrays end up using a scratch buffer most of the time, so also
1540 // assume those use MUBUF instructions. Scratch loads / stores are currently
1541 // implemented as mubuf instructions with offen bit set, so slightly
1542 // different than the normal addr64.
1543 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1544 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1545 return false;
1546
1547 // FIXME: Since we can split immediate into soffset and immediate offset,
1548 // would it make sense to allow any immediate?
1549
1550 switch (AM.Scale) {
1551 case 0: // r + i or just i, depending on HasBaseReg.
1552 return true;
1553 case 1:
1554 return true; // We have r + r or r + i.
1555 case 2:
1556 if (AM.HasBaseReg) {
1557 // Reject 2 * r + r.
1558 return false;
1559 }
1560
1561 // Allow 2 * r as r + r
1562 // Or 2 * r + i is allowed as r + r + i.
1563 return true;
1564 default: // Don't allow n * r
1565 return false;
1566 }
1567}
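// For example, "base + index + legal unsigned immediate" (Scale == 1) is
// accepted, while "base + 4 * index" (Scale == 4) falls into the default
// case and is rejected.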
1568
1570 const AddrMode &AM, Type *Ty,
1571 unsigned AS, Instruction *I) const {
1572 // No global is ever allowed as a base.
1573 if (AM.BaseGV)
1574 return false;
1575
1576 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1577 return isLegalGlobalAddressingMode(AM);
1578
1579 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1583 // If the offset isn't a multiple of 4, it probably isn't going to be
1584 // correctly aligned.
1585 // FIXME: Can we get the real alignment here?
1586 if (AM.BaseOffs % 4 != 0)
1587 return isLegalMUBUFAddressingMode(AM);
1588
1589 if (!Subtarget->hasScalarSubwordLoads()) {
1590 // There are no SMRD extloads, so if we have to do a small type access we
1591 // will use a MUBUF load.
1592 // FIXME?: We also need to do this if unaligned, but we don't know the
1593 // alignment here.
1594 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1595 return isLegalGlobalAddressingMode(AM);
1596 }
1597
1598 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
1599 // SMRD instructions have an 8-bit, dword offset on SI.
1600 if (!isUInt<8>(AM.BaseOffs / 4))
1601 return false;
1602 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1603 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1604 // in 8-bits, it can use a smaller encoding.
1605 if (!isUInt<32>(AM.BaseOffs / 4))
1606 return false;
1607 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1608 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1609 if (!isUInt<20>(AM.BaseOffs))
1610 return false;
1611 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1612 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1613 // for S_BUFFER_* instructions).
1614 if (!isInt<21>(AM.BaseOffs))
1615 return false;
1616 } else {
1617 // On GFX12, all offsets are signed 24-bit in bytes.
1618 if (!isInt<24>(AM.BaseOffs))
1619 return false;
1620 }
1621
1622 if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
1624 AM.BaseOffs < 0) {
1625 // Scalar (non-buffer) loads can only use a negative offset if
1626 // soffset+offset is non-negative. Since the compiler can only prove that
1627 // in a few special cases, it is safer to claim that negative offsets are
1628 // not supported.
1629 return false;
1630 }
1631
1632 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1633 return true;
1634
1635 if (AM.Scale == 1 && AM.HasBaseReg)
1636 return true;
1637
1638 return false;
1639 }
1640
1641 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1642 return Subtarget->enableFlatScratch()
1644 : isLegalMUBUFAddressingMode(AM);
1645
1646 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1647 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
1648 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1649 // field.
1650 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1651 // an 8-bit dword offset but we don't know the alignment here.
1652 if (!isUInt<16>(AM.BaseOffs))
1653 return false;
1654
1655 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1656 return true;
1657
1658 if (AM.Scale == 1 && AM.HasBaseReg)
1659 return true;
1660
1661 return false;
1662 }
1663
1665 // For an unknown address space, this usually means that this is for some
1666 // reason being used for pure arithmetic, and not based on some addressing
1667 // computation. We don't have instructions that compute pointers with any
1668 // addressing modes, so treat them as having no offset like flat
1669 // instructions.
1671 }
1672
1673 // Assume a user alias of global for unknown address spaces.
1674 return isLegalGlobalAddressingMode(AM);
1675}
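// For example, for an LDS (LOCAL_ADDRESS) access, "base + 16-bit unsigned
// immediate" is a legal mode, while any scaled index (Scale > 1) is not.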
1676
1678 const MachineFunction &MF) const {
1680 return (MemVT.getSizeInBits() <= 4 * 32);
1681 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1682 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1683 return (MemVT.getSizeInBits() <= MaxPrivateBits);
1684 }
1686 return (MemVT.getSizeInBits() <= 2 * 32);
1687 return true;
1688}
1689
1691 unsigned Size, unsigned AddrSpace, Align Alignment,
1692 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1693 if (IsFast)
1694 *IsFast = 0;
1695
1696 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1697 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1698 // Check if alignment requirements for ds_read/write instructions are
1699 // disabled.
1700 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1701 return false;
1702
1703 Align RequiredAlignment(PowerOf2Ceil(Size/8)); // Natural alignment.
1704 if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1705 Alignment < RequiredAlignment)
1706 return false;
1707
1708 // Either the alignment requirements are "enabled", or there is an
1709 // unaligned-LDS-access-related hardware bug even though the alignment
1710 // requirements are "disabled". In either case, we need to check for proper
1711 // alignment requirements.
1712 //
1713 switch (Size) {
1714 case 64:
1715 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
1716 // address is negative, then the instruction is incorrectly treated as
1717 // out-of-bounds even if base + offsets is in bounds. Split vectorized
1718 // loads here to avoid emitting ds_read2_b32. We may re-combine the
1719 // load later in the SILoadStoreOptimizer.
1720 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
1721 return false;
1722
1723 // 8 byte accesses via ds_read/write_b64 require 8-byte alignment, but we
1724 // can do a 4 byte aligned, 8 byte access in a single operation using
1725 // ds_read2/write2_b32 with adjacent offsets.
1726 RequiredAlignment = Align(4);
1727
1728 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1729 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
1730 // ds_write2_b32 depending on the alignment. In either case with either
1731 // alignment there is no faster way of doing this.
1732
1733 // The numbers returned here and below are not additive, it is a 'speed
1734 // rank'. They are just meant to be compared to decide if a certain way
1735 // of lowering an operation is faster than another. For that purpose
1736 // a naturally aligned operation gets its bitsize to indicate that "it
1737 // operates with a speed comparable to N-bit wide load". With the full
1738 // alignment ds128 is slower than ds96 for example. If underaligned it
1739 // is comparable to a speed of a single dword access, which would then
1740 // mean 32 < 128 and it is faster to issue a wide load regardless.
1741 // 1 is simply "slow, don't do it". I.e. when comparing an aligned load to a
1742 // wider load which will not be aligned anymore, the latter is slower.
1743 if (IsFast)
1744 *IsFast = (Alignment >= RequiredAlignment) ? 64
1745 : (Alignment < Align(4)) ? 32
1746 : 1;
1747 return true;
1748 }
1749
1750 break;
1751 case 96:
1752 if (!Subtarget->hasDS96AndDS128())
1753 return false;
1754
1755 // 12 byte accesses via ds_read/write_b96 require 16-byte alignment on
1756 // gfx8 and older.
1757
1758 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1759 // Naturally aligned access is fastest. However, also report it is Fast
1760 // if memory is aligned less than DWORD. A narrow load or store will be
1761 // just as slow as a single ds_read_b96/ds_write_b96, but there will
1762 // be more of them, so overall we will pay less penalty issuing a single
1763 // instruction.
1764
1765 // See comment on the values above.
1766 if (IsFast)
1767 *IsFast = (Alignment >= RequiredAlignment) ? 96
1768 : (Alignment < Align(4)) ? 32
1769 : 1;
1770 return true;
1771 }
1772
1773 break;
1774 case 128:
1775 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
1776 return false;
1777
1778 // 16 byte accesses via ds_read/write_b128 require 16-byte alignment on
1779 // gfx8 and older, but we can do a 8 byte aligned, 16 byte access in a
1780 // single operation using ds_read2/write2_b64.
1781 RequiredAlignment = Align(8);
1782
1783 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1784 // Naturally aligned access is fastest. However, also report it as Fast
1785 // if memory is aligned to less than a dword. A narrow load or store will
1786 // be as slow as a single ds_read_b128/ds_write_b128, but there
1787 // will be more of them, so overall we pay a smaller penalty by issuing a
1788 // single instruction.
1789
1790 // See comment on the values above.
1791 if (IsFast)
1792 *IsFast = (Alignment >= RequiredAlignment) ? 128
1793 : (Alignment < Align(4)) ? 32
1794 : 1;
1795 return true;
1796 }
1797
1798 break;
1799 default:
1800 if (Size > 32)
1801 return false;
1802
1803 break;
1804 }
1805
1806 // See comment on the values above.
1807 // Note that we have a single-dword or sub-dword access here, so if it is
1808 // underaligned it is the slowest possible access, hence the returned value is 0.
1809 if (IsFast)
1810 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
1811
1812 return Alignment >= RequiredAlignment ||
1813 Subtarget->hasUnalignedDSAccessEnabled();
1814 }
1815
1816 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
1817 bool AlignedBy4 = Alignment >= Align(4);
1818 if (IsFast)
1819 *IsFast = AlignedBy4;
1820
1821 return AlignedBy4 ||
1822 Subtarget->enableFlatScratch() ||
1823 Subtarget->hasUnalignedScratchAccess();
1824 }
1825
1826 // FIXME: We have to be conservative here and assume that flat operations
1827 // will access scratch. If we had access to the IR function, then we
1828 // could determine if any private memory was used in the function.
1829 if (AddrSpace == AMDGPUAS::FLAT_ADDRESS &&
1830 !Subtarget->hasUnalignedScratchAccess()) {
1831 bool AlignedBy4 = Alignment >= Align(4);
1832 if (IsFast)
1833 *IsFast = AlignedBy4;
1834
1835 return AlignedBy4;
1836 }
1837
1838 // So long as they are correct, wide global memory operations perform better
1839 // than multiple smaller memory ops -- even when misaligned
1840 if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
1841 if (IsFast)
1842 *IsFast = Size;
1843
1844 return Alignment >= Align(4) ||
1845 Subtarget->hasUnalignedBufferAccess();
1846 }
1847
1849 // Smaller-than-dword values must be aligned.
1849 if (Size < 32)
1850 return false;
1851
1852 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1853 // byte-address are ignored, thus forcing Dword alignment.
1854 // This applies to private, global, and constant memory.
1855 if (IsFast)
1856 *IsFast = 1;
1857
1858 return Size >= 32 && Alignment >= Align(4);
1859}
1860
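// Illustrative sketch (not part of the original file): how the "speed rank"
// written to *IsFast above can be consumed. The values are only ordered, never
// added, so a wide-but-underaligned LDS access (rank 32) still compares
// favourably against issuing several narrower accesses. All names below are
// hypothetical.
#include <cstdio>

// Mirrors the 64-bit LDS case above: 4-byte or better alignment ranks as the
// full access bit width, anything narrower collapses to single-dword speed.
static unsigned ldsSpeedRank64(unsigned AlignInBytes) {
  return AlignInBytes >= 4 ? 64 : 32;
}

int main() {
  std::printf("align 8 -> rank %u\n", ldsSpeedRank64(8)); // ds_read_b64
  std::printf("align 4 -> rank %u\n", ldsSpeedRank64(4)); // ds_read2_b32, equally fast
  std::printf("align 2 -> rank %u\n", ldsSpeedRank64(2)); // dword-speed only
  return 0;
}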
1861bool SITargetLowering::allowsMisalignedMemoryAccesses(
1862 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1863 unsigned *IsFast) const {
1864 return allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace,
1865 Alignment, Flags, IsFast);
1866}
1867
1869 const MemOp &Op, const AttributeList &FuncAttributes) const {
1870 // FIXME: Should account for address space here.
1871
1872 // The default fallback uses the private pointer size as a guess for a type to
1873 // use. Make sure we switch these to 64-bit accesses.
1874
1875 if (Op.size() >= 16 &&
1876 Op.isDstAligned(Align(4))) // XXX: Should only do for global
1877 return MVT::v4i32;
1878
1879 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
1880 return MVT::v2i32;
1881
1882 // Use the default.
1883 return MVT::Other;
1884}
1885
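// Illustrative sketch (not part of the original file): the effect of the memop
// type choice above when expanding a small memcpy. The helper name is
// hypothetical; the real decision is made by the generic SelectionDAG memcpy
// expansion using the returned type.
#include <cstddef>
#include <cstdio>

static const char *chunkTypeFor(size_t Size, size_t DstAlign) {
  if (Size >= 16 && DstAlign >= 4)
    return "v4i32 (16-byte chunks)";
  if (Size >= 8 && DstAlign >= 4)
    return "v2i32 (8-byte chunks)";
  return "target default";
}

int main() {
  std::printf("copy 64, align 4 -> %s\n", chunkTypeFor(64, 4));
  std::printf("copy 12, align 4 -> %s\n", chunkTypeFor(12, 4));
  std::printf("copy  6, align 4 -> %s\n", chunkTypeFor(6, 4));
  return 0;
}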
1887 const MemSDNode *MemNode = cast<MemSDNode>(N);
1888 return MemNode->getMemOperand()->getFlags() & MONoClobber;
1889}
1890
1895
1897 unsigned DestAS) const {
1898 // Flat -> private/local is a simple truncate.
1899 // Flat -> global is no-op
1900 if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
1901 return true;
1902
1903 const GCNTargetMachine &TM =
1904 static_cast<const GCNTargetMachine &>(getTargetMachine());
1905 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
1906}
1907
1909 const MemSDNode *MemNode = cast<MemSDNode>(N);
1910
1912}
1913
1921
1923 Type *Ty) const {
1924 // FIXME: Could be smarter if called for vector constants.
1925 return true;
1926}
1927
1929 unsigned Index) const {
1931 return false;
1932
1933 // TODO: Add more cases that are cheap.
1934 return Index == 0;
1935}
1936
1938 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
1939 switch (Op) {
1940 case ISD::LOAD:
1941 case ISD::STORE:
1942
1943 // These operations are done with 32-bit instructions anyway.
1944 case ISD::AND:
1945 case ISD::OR:
1946 case ISD::XOR:
1947 case ISD::SELECT:
1948 // TODO: Extensions?
1949 return true;
1950 default:
1951 return false;
1952 }
1953 }
1954
1955 // SimplifySetCC uses this function to determine whether or not it should
1956 // create setcc with i1 operands. We don't have instructions for i1 setcc.
1957 if (VT == MVT::i1 && Op == ISD::SETCC)
1958 return false;
1959
1961}
1962
1963SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
1964 const SDLoc &SL,
1965 SDValue Chain,
1966 uint64_t Offset) const {
1967 const DataLayout &DL = DAG.getDataLayout();
1970
1971 const ArgDescriptor *InputPtrReg;
1972 const TargetRegisterClass *RC;
1973 LLT ArgTy;
1975
1976 std::tie(InputPtrReg, RC, ArgTy) =
1977 Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
1978
1979 // We may not have the kernarg segment argument if we have no kernel
1980 // arguments.
1981 if (!InputPtrReg)
1982 return DAG.getConstant(Offset, SL, PtrVT);
1983
1985 SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
1986 MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
1987
1988 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
1989}
1990
1991SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
1992 const SDLoc &SL) const {
1995 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
1996}
1997
1998SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
1999 const SDLoc &SL) const {
2000
2002 std::optional<uint32_t> KnownSize =
2004 if (KnownSize.has_value())
2005 return DAG.getConstant(*KnownSize, SL, MVT::i32);
2006 return SDValue();
2007}
2008
2009SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
2010 const SDLoc &SL, SDValue Val,
2011 bool Signed,
2012 const ISD::InputArg *Arg) const {
2013 // First, if it is a widened vector, narrow it.
2014 if (VT.isVector() &&
2016 EVT NarrowedVT =
2019 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
2020 DAG.getConstant(0, SL, MVT::i32));
2021 }
2022
2023 // Then convert the vector elements or scalar value.
2024 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
2025 VT.bitsLT(MemVT)) {
2026 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2027 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
2028 }
2029
2030 if (MemVT.isFloatingPoint())
2031 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2032 else if (Signed)
2033 Val = DAG.getSExtOrTrunc(Val, SL, VT);
2034 else
2035 Val = DAG.getZExtOrTrunc(Val, SL, VT);
2036
2037 return Val;
2038}
2039
2040SDValue SITargetLowering::lowerKernargMemParameter(
2041 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2042 uint64_t Offset, Align Alignment, bool Signed,
2043 const ISD::InputArg *Arg) const {
2045
2046 // Try to avoid using an extload by loading earlier than the argument address,
2047 // and extracting the relevant bits. The load should hopefully be merged with
2048 // the previous argument.
2049 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2050 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2051 int64_t AlignDownOffset = alignDown(Offset, 4);
2052 int64_t OffsetDiff = Offset - AlignDownOffset;
2053
2054 EVT IntVT = MemVT.changeTypeToInteger();
2055
2056 // TODO: If we passed in the base kernel offset we could have a better
2057 // alignment than 4, but we don't really need it.
2058 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2059 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
2062
2063 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
2064 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
2065
2066 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
2067 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2068 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2069
2070
2071 return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL);
2072 }
2073
2074 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2075 SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
2078
2079 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2080 return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
2081}
2082
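// Illustrative sketch (not part of the original file): the arithmetic used by
// lowerKernargMemParameter above for a sub-dword argument -- load the enclosing
// aligned dword, shift the wanted bytes down, then truncate. Assumes a
// little-endian host to match the target; the helper name is hypothetical.
#include <cassert>
#include <cstdint>
#include <cstring>

static uint16_t extractU16Kernarg(const uint8_t *KernargSegment, uint64_t Offset) {
  uint64_t AlignDownOffset = Offset & ~uint64_t(3); // alignDown(Offset, 4)
  uint64_t OffsetDiff = Offset - AlignDownOffset;

  uint32_t Dword;
  std::memcpy(&Dword, KernargSegment + AlignDownOffset, 4); // dword-aligned i32 load

  // SRL by OffsetDiff * 8, then truncate to the argument's memory type.
  return static_cast<uint16_t>(Dword >> (OffsetDiff * 8));
}

int main() {
  alignas(4) uint8_t Seg[8] = {0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88};
  assert(extractU16Kernarg(Seg, 2) == 0x4433); // bytes at offsets 2 and 3
  return 0;
}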
2083SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
2084 const SDLoc &SL, SDValue Chain,
2085 const ISD::InputArg &Arg) const {
2087 MachineFrameInfo &MFI = MF.getFrameInfo();
2088
2089 if (Arg.Flags.isByVal()) {
2090 unsigned Size = Arg.Flags.getByValSize();
2091 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
2092 return DAG.getFrameIndex(FrameIdx, MVT::i32);
2093 }
2094
2095 unsigned ArgOffset = VA.getLocMemOffset();
2096 unsigned ArgSize = VA.getValVT().getStoreSize();
2097
2098 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
2099
2100 // Create load nodes to retrieve arguments from the stack.
2101 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
2102 SDValue ArgValue;
2103
2104 // For NON_EXTLOAD, the generic code in getLoad asserts that ValVT == MemVT.
2106 MVT MemVT = VA.getValVT();
2107
2108 switch (VA.getLocInfo()) {
2109 default:
2110 break;
2111 case CCValAssign::BCvt:
2112 MemVT = VA.getLocVT();
2113 break;
2114 case CCValAssign::SExt:
2115 ExtType = ISD::SEXTLOAD;
2116 break;
2117 case CCValAssign::ZExt:
2118 ExtType = ISD::ZEXTLOAD;
2119 break;
2120 case CCValAssign::AExt:
2121 ExtType = ISD::EXTLOAD;
2122 break;
2123 }
2124
2125 ArgValue = DAG.getExtLoad(
2126 ExtType, SL, VA.getLocVT(), Chain, FIN,
2128 MemVT);
2129 return ArgValue;
2130}
2131
2132SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
2133 const SIMachineFunctionInfo &MFI,
2134 EVT VT,
2136 const ArgDescriptor *Reg = nullptr;
2137 const TargetRegisterClass *RC;
2138 LLT Ty;
2139
2140 CallingConv::ID CC = DAG.getMachineFunction().getFunction().getCallingConv();
2141 const ArgDescriptor WorkGroupIDX =
2142 ArgDescriptor::createRegister(AMDGPU::TTMP9);
2143 // If GridZ is not programmed in an entry function then the hardware will set
2144 // it to all zeros, so there is no need to mask the GridY value in the low
2145 // order bits.
2146 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2147 AMDGPU::TTMP7,
2148 AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2149 const ArgDescriptor WorkGroupIDZ =
2150 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
2151 if (Subtarget->hasArchitectedSGPRs() &&
2153 switch (PVID) {
2155 Reg = &WorkGroupIDX;
2156 RC = &AMDGPU::SReg_32RegClass;
2157 Ty = LLT::scalar(32);
2158 break;
2160 Reg = &WorkGroupIDY;
2161 RC = &AMDGPU::SReg_32RegClass;
2162 Ty = LLT::scalar(32);
2163 break;
2165 Reg = &WorkGroupIDZ;
2166 RC = &AMDGPU::SReg_32RegClass;
2167 Ty = LLT::scalar(32);
2168 break;
2169 default:
2170 break;
2171 }
2172 }
2173
2174 if (!Reg)
2175 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2176 if (!Reg) {
2178 // It's possible for a kernarg intrinsic call to appear in a kernel with
2179 // no allocated segment, in which case we do not add the user sgpr
2180 // argument, so just return null.
2181 return DAG.getConstant(0, SDLoc(), VT);
2182 }
2183
2184 // It's undefined behavior if a function marked with the amdgpu-no-*
2185 // attributes uses the corresponding intrinsic.
2186 return DAG.getUNDEF(VT);
2187 }
2188
2189 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
2190}
2191
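// Illustrative sketch (not part of the original file): with architected SGPRs,
// getPreloadedValue above reads workgroup ID X from TTMP9 and packs IDs Y and Z
// into the low and high halves of TTMP7, which is what the 0xFFFFu and
// 0xFFFF0000u masks select. The struct and helpers are hypothetical.
#include <cassert>
#include <cstdint>

struct ArchitectedSGPRs {
  uint32_t TTMP9; // workgroup ID X
  uint32_t TTMP7; // workgroup ID Z in bits [31:16], workgroup ID Y in bits [15:0]
};

static uint32_t workGroupIDX(const ArchitectedSGPRs &S) { return S.TTMP9; }
static uint32_t workGroupIDY(const ArchitectedSGPRs &S) { return S.TTMP7 & 0xFFFFu; }
static uint32_t workGroupIDZ(const ArchitectedSGPRs &S) { return S.TTMP7 >> 16; }

int main() {
  ArchitectedSGPRs S{/*TTMP9=*/3u, /*TTMP7=*/(7u << 16) | 42u};
  assert(workGroupIDX(S) == 3 && workGroupIDY(S) == 42 && workGroupIDZ(S) == 7);
  return 0;
}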
2193 CallingConv::ID CallConv,
2194 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2195 FunctionType *FType,
2196 SIMachineFunctionInfo *Info) {
2197 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2198 const ISD::InputArg *Arg = &Ins[I];
2199
2200 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2201 "vector type argument should have been split");
2202
2203 // First check if it's a PS input addr.
2204 if (CallConv == CallingConv::AMDGPU_PS &&
2205 !Arg->Flags.isInReg() && PSInputNum <= 15) {
2206 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2207
2208 // Inconveniently only the first part of the split is marked as isSplit,
2209 // so skip to the end. We only want to increment PSInputNum once for the
2210 // entire split argument.
2211 if (Arg->Flags.isSplit()) {
2212 while (!Arg->Flags.isSplitEnd()) {
2213 assert((!Arg->VT.isVector() ||
2214 Arg->VT.getScalarSizeInBits() == 16) &&
2215 "unexpected vector split in ps argument type");
2216 if (!SkipArg)
2217 Splits.push_back(*Arg);
2218 Arg = &Ins[++I];
2219 }
2220 }
2221
2222 if (SkipArg) {
2223 // We can safely skip PS inputs.
2224 Skipped.set(Arg->getOrigArgIndex());
2225 ++PSInputNum;
2226 continue;
2227 }
2228
2229 Info->markPSInputAllocated(PSInputNum);
2230 if (Arg->Used)
2231 Info->markPSInputEnabled(PSInputNum);
2232
2233 ++PSInputNum;
2234 }
2235
2236 Splits.push_back(*Arg);
2237 }
2238}
2239
2240// Allocate special inputs passed in VGPRs.
2242 MachineFunction &MF,
2243 const SIRegisterInfo &TRI,
2244 SIMachineFunctionInfo &Info) const {
2245 const LLT S32 = LLT::scalar(32);
2247
2248 if (Info.hasWorkItemIDX()) {
2249 Register Reg = AMDGPU::VGPR0;
2250 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2251
2252 CCInfo.AllocateReg(Reg);
2253 unsigned Mask = (Subtarget->hasPackedTID() &&
2254 Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2255 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2256 }
2257
2258 if (Info.hasWorkItemIDY()) {
2259 assert(Info.hasWorkItemIDX());
2260 if (Subtarget->hasPackedTID()) {
2261 Info.setWorkItemIDY(ArgDescriptor::createRegister(AMDGPU::VGPR0,
2262 0x3ff << 10));
2263 } else {
2264 unsigned Reg = AMDGPU::VGPR1;
2265 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2266
2267 CCInfo.AllocateReg(Reg);
2268 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2269 }
2270 }
2271
2272 if (Info.hasWorkItemIDZ()) {
2273 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2274 if (Subtarget->hasPackedTID()) {
2275 Info.setWorkItemIDZ(ArgDescriptor::createRegister(AMDGPU::VGPR0,
2276 0x3ff << 20));
2277 } else {
2278 unsigned Reg = AMDGPU::VGPR2;
2279 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2280
2281 CCInfo.AllocateReg(Reg);
2282 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2283 }
2284 }
2285}
2286
2287// Try to allocate a VGPR at the end of the argument list, or if no argument
2288// VGPRs are left, allocate a stack slot.
2289// If \p Mask is given, it indicates the bitfield position in the register.
2290// If \p Arg is given, use it with the new \p Mask instead of allocating a new one.
2291static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2292 ArgDescriptor Arg = ArgDescriptor()) {
2293 if (Arg.isSet())
2294 return ArgDescriptor::createArg(Arg, Mask);
2295
2296 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2297 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2298 if (RegIdx == ArgVGPRs.size()) {
2299 // Spill to stack required.
2300 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2301
2302 return ArgDescriptor::createStack(Offset, Mask);
2303 }
2304
2305 unsigned Reg = ArgVGPRs[RegIdx];
2306 Reg = CCInfo.AllocateReg(Reg);
2307 assert(Reg != AMDGPU::NoRegister);
2308
2309 MachineFunction &MF = CCInfo.getMachineFunction();
2310 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2311 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2312 return ArgDescriptor::createRegister(Reg, Mask);
2313}
2314
2316 const TargetRegisterClass *RC,
2317 unsigned NumArgRegs) {
2318 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2319 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2320 if (RegIdx == ArgSGPRs.size())
2321 report_fatal_error("ran out of SGPRs for arguments");
2322
2323 unsigned Reg = ArgSGPRs[RegIdx];
2324 Reg = CCInfo.AllocateReg(Reg);
2325 assert(Reg != AMDGPU::NoRegister);
2326
2327 MachineFunction &MF = CCInfo.getMachineFunction();
2328 MF.addLiveIn(Reg, RC);
2330}
2331
2332// If this has a fixed position, we still should allocate the register in the
2333// CCInfo state. Technically we could get away with this for values passed
2334// outside of the normal argument range.
2336 const TargetRegisterClass *RC,
2337 MCRegister Reg) {
2338 Reg = CCInfo.AllocateReg(Reg);
2339 assert(Reg != AMDGPU::NoRegister);
2340 MachineFunction &MF = CCInfo.getMachineFunction();
2341 MF.addLiveIn(Reg, RC);
2342}
2343
2344static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2345 if (Arg) {
2346 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2347 Arg.getRegister());
2348 } else
2349 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2350}
2351
2352static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2353 if (Arg) {
2354 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2355 Arg.getRegister());
2356 } else
2357 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2358}
2359
2360/// Allocate implicit function VGPR arguments at the end of allocated user
2361/// arguments.
2363 CCState &CCInfo, MachineFunction &MF,
2364 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2365 const unsigned Mask = 0x3ff;
2366 ArgDescriptor Arg;
2367
2368 if (Info.hasWorkItemIDX()) {
2369 Arg = allocateVGPR32Input(CCInfo, Mask);
2370 Info.setWorkItemIDX(Arg);
2371 }
2372
2373 if (Info.hasWorkItemIDY()) {
2374 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2375 Info.setWorkItemIDY(Arg);
2376 }
2377
2378 if (Info.hasWorkItemIDZ())
2379 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2380}
2381
2382/// Allocate implicit function VGPR arguments in fixed registers.
2384 CCState &CCInfo, MachineFunction &MF,
2385 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2386 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2387 if (!Reg)
2388 report_fatal_error("failed to allocated VGPR for implicit arguments");
2389
2390 const unsigned Mask = 0x3ff;
2391 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2392 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2393 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2394}
2395
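// Illustrative sketch (not part of the original file): the fixed-register ABI
// above packs all three workitem IDs into a single VGPR, ten bits each, which
// is what the Mask, Mask << 10 and Mask << 20 descriptors describe. The helper
// names are hypothetical.
#include <cassert>
#include <cstdint>

static uint32_t packWorkItemIDs(uint32_t X, uint32_t Y, uint32_t Z) {
  const uint32_t Mask = 0x3ff;
  return (X & Mask) | ((Y & Mask) << 10) | ((Z & Mask) << 20);
}

static uint32_t workItemIDX(uint32_t Packed) { return Packed & 0x3ff; }
static uint32_t workItemIDY(uint32_t Packed) { return (Packed >> 10) & 0x3ff; }
static uint32_t workItemIDZ(uint32_t Packed) { return (Packed >> 20) & 0x3ff; }

int main() {
  uint32_t Packed = packWorkItemIDs(5, 6, 7);
  assert(workItemIDX(Packed) == 5 && workItemIDY(Packed) == 6 && workItemIDZ(Packed) == 7);
  return 0;
}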
2397 CCState &CCInfo,
2398 MachineFunction &MF,
2399 const SIRegisterInfo &TRI,
2400 SIMachineFunctionInfo &Info) const {
2401 auto &ArgInfo = Info.getArgInfo();
2402 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2403
2404 // TODO: Unify handling with private memory pointers.
2405 if (UserSGPRInfo.hasDispatchPtr())
2406 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2407
2408 const Module *M = MF.getFunction().getParent();
2409 if (UserSGPRInfo.hasQueuePtr() &&
2411 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2412
2413 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2414 // constant offset from the kernarg segment.
2415 if (Info.hasImplicitArgPtr())
2416 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2417
2418 if (UserSGPRInfo.hasDispatchID())
2419 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2420
2421 // flat_scratch_init is not applicable for non-kernel functions.
2422
2423 if (Info.hasWorkGroupIDX())
2424 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2425
2426 if (Info.hasWorkGroupIDY())
2427 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2428
2429 if (Info.hasWorkGroupIDZ())
2430 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2431
2432 if (Info.hasLDSKernelId())
2433 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2434}
2435
2436// Allocate special inputs passed in user SGPRs.
2438 MachineFunction &MF,
2439 const SIRegisterInfo &TRI,
2440 SIMachineFunctionInfo &Info) const {
2441 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2442 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2443 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2444 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2445 CCInfo.AllocateReg(ImplicitBufferPtrReg);
2446 }
2447
2448 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2449 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2450 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2451 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2452 CCInfo.AllocateReg(PrivateSegmentBufferReg);
2453 }
2454
2455 if (UserSGPRInfo.hasDispatchPtr()) {
2456 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2457 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2458 CCInfo.AllocateReg(DispatchPtrReg);
2459 }
2460
2461 const Module *M = MF.getFunction().getParent();
2462 if (UserSGPRInfo.hasQueuePtr() &&
2464 Register QueuePtrReg = Info.addQueuePtr(TRI);
2465 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2466 CCInfo.AllocateReg(QueuePtrReg);
2467 }
2468
2469 if (UserSGPRInfo.hasKernargSegmentPtr()) {
2471 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2472 CCInfo.AllocateReg(InputPtrReg);
2473
2474 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2475 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2476 }
2477
2478 if (UserSGPRInfo.hasDispatchID()) {
2479 Register DispatchIDReg = Info.addDispatchID(TRI);
2480 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2481 CCInfo.AllocateReg(DispatchIDReg);
2482 }
2483
2484 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2485 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2486 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2487 CCInfo.AllocateReg(FlatScratchInitReg);
2488 }
2489
2490 if (UserSGPRInfo.hasPrivateSegmentSize()) {
2491 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
2492 MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
2493 CCInfo.AllocateReg(PrivateSegmentSizeReg);
2494 }
2495
2496 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2497 // these from the dispatch pointer.
2498}
2499
2500// Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
2501// sequential, starting from the first argument.
2503 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
2505 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2506 Function &F = MF.getFunction();
2507 unsigned LastExplicitArgOffset =
2508 MF.getSubtarget<GCNSubtarget>().getExplicitKernelArgOffset();
2509 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
2510 bool InPreloadSequence = true;
2511 unsigned InIdx = 0;
2512 for (auto &Arg : F.args()) {
2513 if (!InPreloadSequence || !Arg.hasInRegAttr())
2514 break;
2515
2516 int ArgIdx = Arg.getArgNo();
2517 // Don't preload non-original args or parts not in the current preload
2518 // sequence.
2519 if (InIdx < Ins.size() && (!Ins[InIdx].isOrigArg() ||
2520 (int)Ins[InIdx].getOrigArgIndex() != ArgIdx))
2521 break;
2522
2523 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2524 (int)Ins[InIdx].getOrigArgIndex() == ArgIdx;
2525 InIdx++) {
2526 assert(ArgLocs[ArgIdx].isMemLoc());
2527 auto &ArgLoc = ArgLocs[InIdx];
2528 const Align KernelArgBaseAlign = Align(16);
2529 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2530 Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
2531 unsigned NumAllocSGPRs =
2532 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
2533
2534 // Arg is preloaded into the previous SGPR.
2535 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2536 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2537 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2538 continue;
2539 }
2540
2541 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2542 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
2543 // Check for free user SGPRs for preloading.
2544 if (PaddingSGPRs + NumAllocSGPRs + 1 /*Synthetic SGPRs*/ >
2545 SGPRInfo.getNumFreeUserSGPRs()) {
2546 InPreloadSequence = false;
2547 break;
2548 }
2549
2550 // Preload this argument.
2551 const TargetRegisterClass *RC =
2552 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
2553 SmallVectorImpl<MCRegister> *PreloadRegs =
2554 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
2555
2556 if (PreloadRegs->size() > 1)
2557 RC = &AMDGPU::SGPR_32RegClass;
2558 for (auto &Reg : *PreloadRegs) {
2559 assert(Reg);
2560 MF.addLiveIn(Reg, RC);
2561 CCInfo.AllocateReg(Reg);
2562 }
2563
2564 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
2565 }
2566 }
2567}
2568
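// Illustrative sketch (not part of the original file): how the preload loop
// above budgets user SGPRs -- each argument consumes its size rounded up to
// whole dwords, plus padding dwords needed to reach its offset from the end of
// the previous explicit argument. Helper names are hypothetical.
#include <cstdio>

static unsigned sgprsForArg(unsigned SizeInBits) {
  return (SizeInBits + 31) / 32; // alignTo(SizeInBits, 32) / 32
}

static unsigned paddingSGPRs(unsigned ArgOffset, unsigned LastExplicitArgOffset) {
  unsigned Padding = ArgOffset - LastExplicitArgOffset;
  return (Padding + 3) / 4; // alignTo(Padding, 4) / 4
}

int main() {
  // A 64-bit argument at offset 8, following a 32-bit argument that ended at 4:
  // one padding SGPR plus two SGPRs for the argument itself.
  std::printf("padding=%u alloc=%u\n", paddingSGPRs(8, 4), sgprsForArg(64));
  return 0;
}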
2570 const SIRegisterInfo &TRI,
2571 SIMachineFunctionInfo &Info) const {
2572 // Always allocate this last since it is a synthetic preload.
2573 if (Info.hasLDSKernelId()) {
2574 Register Reg = Info.addLDSKernelId();
2575 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2576 CCInfo.AllocateReg(Reg);
2577 }
2578}
2579
2580// Allocate special input registers that are initialized per-wave.
2582 MachineFunction &MF,
2584 CallingConv::ID CallConv,
2585 bool IsShader) const {
2586 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
2587 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
2588 // Note: user SGPRs are handled by the front-end for graphics shaders
2589 // Pad up the used user SGPRs with dead inputs.
2590
2591 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
2592 // before enabling architected SGPRs for workgroup IDs.
2593 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
2594
2595 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
2596 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
2597 // rely on it to reach 16 since if we end up having no stack usage, it will
2598 // not really be added.
2599 unsigned NumRequiredSystemSGPRs = Info.hasWorkGroupIDX() +
2600 Info.hasWorkGroupIDY() +
2601 Info.hasWorkGroupIDZ() +
2602 Info.hasWorkGroupInfo();
2603 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
2604 Register Reg = Info.addReservedUserSGPR();
2605 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2606 CCInfo.AllocateReg(Reg);
2607 }
2608 }
2609
2610 if (!HasArchitectedSGPRs) {
2611 if (Info.hasWorkGroupIDX()) {
2612 Register Reg = Info.addWorkGroupIDX();
2613 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2614 CCInfo.AllocateReg(Reg);
2615 }
2616
2617 if (Info.hasWorkGroupIDY()) {
2618 Register Reg = Info.addWorkGroupIDY();
2619 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2620 CCInfo.AllocateReg(Reg);
2621 }
2622
2623 if (Info.hasWorkGroupIDZ()) {
2624 Register Reg = Info.addWorkGroupIDZ();
2625 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2626 CCInfo.AllocateReg(Reg);
2627 }
2628 }
2629
2630 if (Info.hasWorkGroupInfo()) {
2631 Register Reg = Info.addWorkGroupInfo();
2632 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2633 CCInfo.AllocateReg(Reg);
2634 }
2635
2636 if (Info.hasPrivateSegmentWaveByteOffset()) {
2637 // Scratch wave offset passed in system SGPR.
2638 unsigned PrivateSegmentWaveByteOffsetReg;
2639
2640 if (IsShader) {
2641 PrivateSegmentWaveByteOffsetReg =
2642 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
2643
2644 // This is true if the scratch wave byte offset doesn't have a fixed
2645 // location.
2646 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
2647 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
2648 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
2649 }
2650 } else
2651 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
2652
2653 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
2654 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
2655 }
2656
2657 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
2658 Info.getNumPreloadedSGPRs() >= 16);
2659}
2660
2662 MachineFunction &MF,
2663 const SIRegisterInfo &TRI,
2664 SIMachineFunctionInfo &Info) {
2665 // Now that we've figured out where the scratch register inputs are, see if
2666 // we should reserve the arguments and use them directly.
2667 MachineFrameInfo &MFI = MF.getFrameInfo();
2668 bool HasStackObjects = MFI.hasStackObjects();
2669 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2670
2671 // Record that we know we have non-spill stack objects so we don't need to
2672 // check all stack objects later.
2673 if (HasStackObjects)
2674 Info.setHasNonSpillStackObjects(true);
2675
2676 // Everything live out of a block is spilled with fast regalloc, so it's
2677 // almost certain that spilling will be required.
2678 if (TM.getOptLevel() == CodeGenOptLevel::None)
2679 HasStackObjects = true;
2680
2681 // For now assume stack access is needed in any callee functions, so we need
2682 // the scratch registers to pass in.
2683 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
2684
2685 if (!ST.enableFlatScratch()) {
2686 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
2687 // If we have stack objects, we unquestionably need the private buffer
2688 // resource. For the Code Object V2 ABI, this will be the first 4 user
2689 // SGPR inputs. We can reserve those and use them directly.
2690
2691 Register PrivateSegmentBufferReg =
2693 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
2694 } else {
2695 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
2696 // We tentatively reserve the last registers (skipping the last few
2697 // registers, which may contain VCC, FLAT_SCR, and XNACK). After register
2698 // allocation, we'll replace these with the ones immediately after those
2699 // which were really allocated. In the prologue, copies will be inserted
2700 // from the argument to these reserved registers.
2701
2702 // Without HSA, relocations are used for the scratch pointer and the
2703 // buffer resource setup is always inserted in the prologue. Scratch wave
2704 // offset is still in an input SGPR.
2705 Info.setScratchRSrcReg(ReservedBufferReg);
2706 }
2707 }
2708
2710
2711 // For entry functions we have to set up the stack pointer if we use it,
2712 // whereas non-entry functions get this "for free". This means there is no
2713 // intrinsic advantage to using S32 over S34 in cases where we do not have
2714 // calls but do need a frame pointer (i.e. if we are requested to have one
2715 // because frame pointer elimination is disabled). To keep things simple we
2716 // only ever use S32 as the call ABI stack pointer, and so using it does not
2717 // imply we need a separate frame pointer.
2718 //
2719 // Try to use s32 as the SP, but move it if it would interfere with input
2720 // arguments. This won't work with calls though.
2721 //
2722 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
2723 // registers.
2724 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
2725 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
2726 } else {
2728
2729 if (MFI.hasCalls())
2730 report_fatal_error("call in graphics shader with too many input SGPRs");
2731
2732 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
2733 if (!MRI.isLiveIn(Reg)) {
2734 Info.setStackPtrOffsetReg(Reg);
2735 break;
2736 }
2737 }
2738
2739 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
2740 report_fatal_error("failed to find register for SP");
2741 }
2742
2743 // hasFP should be accurate for entry functions even before the frame is
2744 // finalized, because it does not rely on the known stack size, only
2745 // properties like whether variable sized objects are present.
2746 if (ST.getFrameLowering()->hasFP(MF)) {
2747 Info.setFrameOffsetReg(AMDGPU::SGPR33);
2748 }
2749}
2750
2753 return !Info->isEntryFunction();
2754}
2755
2759
2761 MachineBasicBlock *Entry,
2762 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
2764
2765 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
2766 if (!IStart)
2767 return;
2768
2769 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2770 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
2772 for (const MCPhysReg *I = IStart; *I; ++I) {
2773 const TargetRegisterClass *RC = nullptr;
2774 if (AMDGPU::SReg_64RegClass.contains(*I))
2775 RC = &AMDGPU::SGPR_64RegClass;
2776 else if (AMDGPU::SReg_32RegClass.contains(*I))
2777 RC = &AMDGPU::SGPR_32RegClass;
2778 else
2779 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2780
2781 Register NewVR = MRI->createVirtualRegister(RC);
2782 // Create copy from CSR to a virtual register.
2783 Entry->addLiveIn(*I);
2784 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
2785 .addReg(*I);
2786
2787 // Insert the copy-back instructions right before the terminator.
2788 for (auto *Exit : Exits)
2789 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
2790 TII->get(TargetOpcode::COPY), *I)
2791 .addReg(NewVR);
2792 }
2793}
2794
2796 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2797 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2798 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2800
2802 const Function &Fn = MF.getFunction();
2805
2806 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
2807 DiagnosticInfoUnsupported NoGraphicsHSA(
2808 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
2809 DAG.getContext()->diagnose(NoGraphicsHSA);
2810 return DAG.getEntryNode();
2811 }
2812
2815 BitVector Skipped(Ins.size());
2816 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2817 *DAG.getContext());
2818
2819 bool IsGraphics = AMDGPU::isGraphics(CallConv);
2820 bool IsKernel = AMDGPU::isKernel(CallConv);
2821 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
2822
2823 if (IsGraphics) {
2824 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
2825 assert(!UserSGPRInfo.hasDispatchPtr() &&
2826 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
2827 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
2828 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
2829 (void)UserSGPRInfo;
2830 if (!Subtarget->enableFlatScratch())
2831 assert(!UserSGPRInfo.hasFlatScratchInit());
2832 if ((CallConv != CallingConv::AMDGPU_CS &&
2833 CallConv != CallingConv::AMDGPU_Gfx) ||
2834 !Subtarget->hasArchitectedSGPRs())
2835 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
2836 !Info->hasWorkGroupIDZ());
2837 }
2838
2839 if (CallConv == CallingConv::AMDGPU_PS) {
2840 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
2841
2842 // At least one interpolation mode must be enabled or else the GPU will
2843 // hang.
2844 //
2845 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
2846 // set PSInputAddr, the user wants to enable some bits after the compilation
2847 // based on run-time states. Since we can't know what the final PSInputEna
2848 // will look like, we shouldn't do anything here, and the user should take
2849 // responsibility for the correct programming.
2850 //
2851 // Otherwise, the following restrictions apply:
2852 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
2853 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
2854 // enabled too.
2855 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
2856 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
2857 CCInfo.AllocateReg(AMDGPU::VGPR0);
2858 CCInfo.AllocateReg(AMDGPU::VGPR1);
2859 Info->markPSInputAllocated(0);
2860 Info->markPSInputEnabled(0);
2861 }
2862 if (Subtarget->isAmdPalOS()) {
2863 // For isAmdPalOS, the user does not enable some bits after compilation
2864 // based on run-time states; the register values being generated here are
2865 // the final ones set in hardware. Therefore we need to apply the
2866 // workaround to PSInputAddr and PSInputEnable together. (The case where
2867 // a bit is set in PSInputAddr but not PSInputEnable is where the
2868 // frontend set up an input arg for a particular interpolation mode, but
2869 // nothing uses that input arg. Really we should have an earlier pass
2870 // that removes such an arg.)
2871 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
2872 if ((PsInputBits & 0x7F) == 0 ||
2873 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
2874 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
2875 }
2876 } else if (IsKernel) {
2877 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
2878 } else {
2879 Splits.append(Ins.begin(), Ins.end());
2880 }
2881
2882 if (IsKernel)
2883 analyzeFormalArgumentsCompute(CCInfo, Ins);
2884
2885 if (IsEntryFunc) {
2886 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
2887 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
2888 if (IsKernel && Subtarget->hasKernargPreload())
2889 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
2890
2891 allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
2892 } else if (!IsGraphics) {
2893 // For the fixed ABI, pass workitem IDs in the last argument register.
2894 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
2895
2896 // FIXME: Sink this into allocateSpecialInputSGPRs
2897 if (!Subtarget->enableFlatScratch())
2898 CCInfo.AllocateReg(Info->getScratchRSrcReg());
2899
2900 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
2901 }
2902
2903 if (!IsKernel) {
2904 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
2905 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
2906 }
2907
2909
2910 // FIXME: This is the minimum kernel argument alignment. We should improve
2911 // this to the maximum alignment of the arguments.
2912 //
2913 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
2914 // kern arg offset.
2915 const Align KernelArgBaseAlign = Align(16);
2916
2917 for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
2918 const ISD::InputArg &Arg = Ins[i];
2919 if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
2920 InVals.push_back(DAG.getUNDEF(Arg.VT));
2921 continue;
2922 }
2923
2924 CCValAssign &VA = ArgLocs[ArgIdx++];
2925 MVT VT = VA.getLocVT();
2926
2927 if (IsEntryFunc && VA.isMemLoc()) {
2928 VT = Ins[i].VT;
2929 EVT MemVT = VA.getLocVT();
2930
2931 const uint64_t Offset = VA.getLocMemOffset();
2932 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
2933
2934 if (Arg.Flags.isByRef()) {
2935 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
2936
2937 const GCNTargetMachine &TM =
2938 static_cast<const GCNTargetMachine &>(getTargetMachine());
2939 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
2940 Arg.Flags.getPointerAddrSpace())) {
2943 }
2944
2945 InVals.push_back(Ptr);
2946 continue;
2947 }
2948
2949 SDValue NewArg;
2950 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
2951 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2952 // In this case the argument is packed into the previous preload SGPR.
2953 int64_t AlignDownOffset = alignDown(Offset, 4);
2954 int64_t OffsetDiff = Offset - AlignDownOffset;
2955 EVT IntVT = MemVT.changeTypeToInteger();
2956
2957 const SIMachineFunctionInfo *Info =
2960 Register Reg =
2961 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
2962
2963 assert(Reg);
2964 Register VReg = MRI.getLiveInVirtReg(Reg);
2965 SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
2966
2967 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
2968 SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
2969
2970 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
2971 ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
2972 NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
2973 Ins[i].Flags.isSExt(), &Ins[i]);
2974
2975 NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
2976 } else {
2977 const SIMachineFunctionInfo *Info =
2980 const SmallVectorImpl<MCRegister> &PreloadRegs =
2981 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
2982
2983 SDValue Copy;
2984 if (PreloadRegs.size() == 1) {
2985 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
2986 const TargetRegisterClass *RC = MRI.getRegClass(VReg);
2987 NewArg = DAG.getCopyFromReg(
2988 Chain, DL, VReg,
2990 TRI->getRegSizeInBits(*RC)));
2991
2992 } else {
2993 // If the kernarg alignment does not match the alignment of the SGPR
2994 // tuple RC that can accommodate this argument, it will be built up
2995 // via copies from the individual SGPRs that the argument was
2996 // preloaded to.
2998 for (auto Reg : PreloadRegs) {
2999 Register VReg = MRI.getLiveInVirtReg(Reg);
3000 Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3001 Elts.push_back(Copy);
3002 }
3003 NewArg =
3004 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3005 PreloadRegs.size()),
3006 DL, Elts);
3007 }
3008
3009 // If the argument was preloaded to multiple consecutive 32-bit
3010 // registers because of misalignment between addressable SGPR tuples
3011 // and the argument size, we can still assume, because of kernarg
3012 // segment alignment restrictions, that NewArg's size is the same as
3013 // MemVT's and just do a bitcast. If MemVT is less than 32 bits we add a
3014 // truncate since we cannot preload to less than a single SGPR and the
3015 // MemVT may be smaller.
3016 EVT MemVTInt =
3018 if (MemVT.bitsLT(NewArg.getSimpleValueType()))
3019 NewArg = DAG.getNode(ISD::TRUNCATE, DL, MemVTInt, NewArg);
3020
3021 NewArg = DAG.getBitcast(MemVT, NewArg);
3022 NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
3023 Ins[i].Flags.isSExt(), &Ins[i]);
3024 NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
3025 }
3026 } else {
3027 NewArg =
3028 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
3029 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3030 }
3031 Chains.push_back(NewArg.getValue(1));
3032
3033 auto *ParamTy =
3034 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
3036 ParamTy && (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3037 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3038 // On SI local pointers are just offsets into LDS, so they are always
3039 // less than 16-bits. On CI and newer they could potentially be
3040 // real pointers, so we can't guarantee their size.
3041 NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
3042 DAG.getValueType(MVT::i16));
3043 }
3044
3045 InVals.push_back(NewArg);
3046 continue;
3047 }
3048 if (!IsEntryFunc && VA.isMemLoc()) {
3049 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3050 InVals.push_back(Val);
3051 if (!Arg.Flags.isByVal())
3052 Chains.push_back(Val.getValue(1));
3053 continue;
3054 }
3055
3056 assert(VA.isRegLoc() && "Parameter must be in a register!");
3057
3058 Register Reg = VA.getLocReg();
3059 const TargetRegisterClass *RC = nullptr;
3060 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3061 RC = &AMDGPU::VGPR_32RegClass;
3062 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3063 RC = &AMDGPU::SGPR_32RegClass;
3064 else
3065 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3066 EVT ValVT = VA.getValVT();
3067
3068 Reg = MF.addLiveIn(Reg, RC);
3069 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
3070
3071 if (Arg.Flags.isSRet()) {
3072 // The return object should be reasonably addressable.
3073
3074 // FIXME: This helps when the return is a real sret. If it is an
3075 // automatically inserted sret (i.e. CanLowerReturn returns false), an
3076 // extra copy is inserted in SelectionDAGBuilder which obscures this.
3077 unsigned NumBits
3079 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
3080 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
3081 }
3082
3083 // If this is an 8 or 16-bit value, it is really passed promoted
3084 // to 32 bits. Insert an assert[sz]ext to capture this, then
3085 // truncate to the right size.
3086 switch (VA.getLocInfo()) {
3087 case CCValAssign::Full:
3088 break;
3089 case CCValAssign::BCvt:
3090 Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
3091 break;
3092 case CCValAssign::SExt:
3093 Val = DAG.getNode(ISD::AssertSext, DL, VT, Val,
3094 DAG.getValueType(ValVT));
3095 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3096 break;
3097 case CCValAssign::ZExt:
3098 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
3099 DAG.getValueType(ValVT));
3100 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3101 break;
3102 case CCValAssign::AExt:
3103 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3104 break;
3105 default:
3106 llvm_unreachable("Unknown loc info!");
3107 }
3108
3109 InVals.push_back(Val);
3110 }
3111
3112 // Start adding system SGPRs.
3113 if (IsEntryFunc)
3114 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
3115
3116 // DAG.getPass() returns nullptr when using new pass manager.
3117 // TODO: Use DAG.getMFAM() to access analysis result.
3118 if (DAG.getPass()) {
3119 auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3120 ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
3121 }
3122
3123 unsigned StackArgSize = CCInfo.getStackSize();
3124 Info->setBytesInStackArgArea(StackArgSize);
3125
3126 return Chains.empty() ? Chain :
3127 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3128}
3129
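// Illustrative sketch (not part of the original file): the PS input check used
// by LowerFormalArguments above. At least one PERSP_* (bits 0-3) or LINEAR_*
// (bits 4-6) input must be enabled, and POS_W_FLOAT (bit 11) additionally needs
// a PERSP_* bit, otherwise a fallback input has to be force-enabled to keep the
// GPU from hanging. The helper name is hypothetical.
#include <cassert>
#include <cstdint>

static bool needsFallbackPSInput(uint32_t PsInputBits) {
  return (PsInputBits & 0x7F) == 0 ||
         ((PsInputBits & 0xF) == 0 && ((PsInputBits >> 11) & 1));
}

int main() {
  assert(needsFallbackPSInput(0));         // nothing enabled at all
  assert(needsFallbackPSInput(1u << 11));  // POS_W_FLOAT without any PERSP_*
  assert(!needsFallbackPSInput(1u << 0));  // PERSP_SAMPLE enabled
  return 0;
}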
3130// TODO: If return values can't fit in registers, we should return as many as
3131// possible in registers before passing on stack.
3133 CallingConv::ID CallConv,
3134 MachineFunction &MF, bool IsVarArg,
3136 LLVMContext &Context) const {
3137 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3138 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3139 // for shaders. Vector types should be explicitly handled by CC.
3140 if (AMDGPU::isEntryFunctionCC(CallConv))
3141 return true;
3142
3144 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3145 if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
3146 return false;
3147
3148 // We must use the stack if return would require unavailable registers.
3149 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3150 unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
3151 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3152 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3153 return false;
3154
3155 return true;
3156}
3157
3158SDValue
3159SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3160 bool isVarArg,
3162 const SmallVectorImpl<SDValue> &OutVals,
3163 const SDLoc &DL, SelectionDAG &DAG) const {
3166
3167 if (AMDGPU::isKernel(CallConv)) {
3168 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3169 OutVals, DL, DAG);
3170 }
3171
3172 bool IsShader = AMDGPU::isShader(CallConv);
3173
3174 Info->setIfReturnsVoid(Outs.empty());
3175 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3176
3177 // CCValAssign - represent the assignment of the return value to a location.
3180
3181 // CCState - Info about the registers and stack slots.
3182 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3183 *DAG.getContext());
3184
3185 // Analyze outgoing return values.
3186 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3187
3188 SDValue Glue;
3190 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3191
3192 // Copy the result values into the output registers.
3193 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3194 ++I, ++RealRVLocIdx) {
3195 CCValAssign &VA = RVLocs[I];
3196 assert(VA.isRegLoc() && "Can only return in registers!");
3197 // TODO: Partially return in registers if return values don't fit.
3198 SDValue Arg = OutVals[RealRVLocIdx];
3199
3200 // Copied from other backends.
3201 switch (VA.getLocInfo()) {
3202 case CCValAssign::Full:
3203 break;
3204 case CCValAssign::BCvt:
3205 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3206 break;
3207 case CCValAssign::SExt:
3208 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3209 break;
3210 case CCValAssign::ZExt:
3211 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3212 break;
3213 case CCValAssign::AExt:
3214 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3215 break;
3216 default:
3217 llvm_unreachable("Unknown loc info!");
3218 }
3219
3220 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
3221 Glue = Chain.getValue(1);
3222 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3223 }
3224
3225 // FIXME: Does sret work properly?
3226 if (!Info->isEntryFunction()) {
3227 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3228 const MCPhysReg *I =
3229 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3230 if (I) {
3231 for (; *I; ++I) {
3232 if (AMDGPU::SReg_64RegClass.contains(*I))
3233 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3234 else if (AMDGPU::SReg_32RegClass.contains(*I))
3235 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3236 else
3237 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3238 }
3239 }
3240 }
3241
3242 // Update chain and glue.
3243 RetOps[0] = Chain;
3244 if (Glue.getNode())
3245 RetOps.push_back(Glue);
3246
3247 unsigned Opc = AMDGPUISD::ENDPGM;
3248 if (!IsWaveEnd)
3250 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3251}
3252
3254 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3255 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3256 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3257 SDValue ThisVal) const {
3258 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
3259
3260 // Assign locations to each value returned by this call.
3262 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3263 *DAG.getContext());
3264 CCInfo.AnalyzeCallResult(Ins, RetCC);
3265
3266 // Copy all of the result registers out of their specified physreg.
3267 for (CCValAssign VA : RVLocs) {
3268 SDValue Val;
3269
3270 if (VA.isRegLoc()) {
3271 Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
3272 Chain = Val.getValue(1);
3273 InGlue = Val.getValue(2);
3274 } else if (VA.isMemLoc()) {
3275 report_fatal_error("TODO: return values in memory");
3276 } else
3277 llvm_unreachable("unknown argument location type");
3278
3279 switch (VA.getLocInfo()) {
3280 case CCValAssign::Full:
3281 break;
3282 case CCValAssign::BCvt:
3283 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3284 break;
3285 case CCValAssign::ZExt:
3286 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
3287 DAG.getValueType(VA.getValVT()));
3288 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3289 break;
3290 case CCValAssign::SExt:
3291 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
3292 DAG.getValueType(VA.getValVT()));
3293 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3294 break;
3295 case CCValAssign::AExt:
3296 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3297 break;
3298 default:
3299 llvm_unreachable("Unknown loc info!");
3300 }
3301
3302 InVals.push_back(Val);
3303 }
3304
3305 return Chain;
3306}
3307
3308// Add code to pass the special inputs required, depending on the features used,
3309// separate from the explicit user arguments present in the IR.
3311 CallLoweringInfo &CLI,
3312 CCState &CCInfo,
3313 const SIMachineFunctionInfo &Info,
3314 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3315 SmallVectorImpl<SDValue> &MemOpChains,
3316 SDValue Chain) const {
3317 // If we don't have a call site, this was a call inserted by
3318 // legalization. These can never use special inputs.
3319 if (!CLI.CB)
3320 return;
3321
3322 SelectionDAG &DAG = CLI.DAG;
3323 const SDLoc &DL = CLI.DL;
3324 const Function &F = DAG.getMachineFunction().getFunction();
3325
3326 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3327 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3328
3329 const AMDGPUFunctionArgInfo *CalleeArgInfo
3331 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
3332 // DAG.getPass() returns nullptr when using new pass manager.
3333 // TODO: Use DAG.getMFAM() to access analysis result.
3334 if (DAG.getPass()) {
3335 auto &ArgUsageInfo =
3337 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
3338 }
3339 }
3340
3341 // TODO: Unify with private memory register handling. This is complicated by
3342 // the fact that at least in kernels, the input argument is not necessarily
3343 // in the same location as the input.
3344 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3346 {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
3347 {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr" },
3348 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
3349 {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"},
3350 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
3351 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,"amdgpu-no-workgroup-id-y"},
3352 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,"amdgpu-no-workgroup-id-z"},
3353 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID,"amdgpu-no-lds-kernel-id"},
3354 };
3355
3356 for (auto Attr : ImplicitAttrs) {
3357 const ArgDescriptor *OutgoingArg;
3358 const TargetRegisterClass *ArgRC;
3359 LLT ArgTy;
3360
3361 AMDGPUFunctionArgInfo::PreloadedValue InputID = Attr.first;
3362
3363 // If the callee does not use the attribute value, skip copying the value.
3364 if (CLI.CB->hasFnAttr(Attr.second))
3365 continue;
3366
3367 std::tie(OutgoingArg, ArgRC, ArgTy) =
3368 CalleeArgInfo->getPreloadedValue(InputID);
3369 if (!OutgoingArg)
3370 continue;
3371
3372 const ArgDescriptor *IncomingArg;
3373 const TargetRegisterClass *IncomingArgRC;
3374 LLT Ty;
3375 std::tie(IncomingArg, IncomingArgRC, Ty) =
3376 CallerArgInfo.getPreloadedValue(InputID);
3377 assert(IncomingArgRC == ArgRC);
3378
3379 // All special arguments are ints for now.
3380 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3381 SDValue InputReg;
3382
3383 if (IncomingArg) {
3384 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
3385 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3386 // The implicit arg ptr is special because it doesn't have a corresponding
3387 // input for kernels, and is computed from the kernarg segment pointer.
3388 InputReg = getImplicitArgPtr(DAG, DL);
3389 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3390 std::optional<uint32_t> Id =
3392 if (Id.has_value()) {
3393 InputReg = DAG.getConstant(*Id, DL, ArgVT);
3394 } else {
3395 InputReg = DAG.getUNDEF(ArgVT);
3396 }
3397 } else {
3398 // We may have proven the input wasn't needed, although the ABI still
3399 // requires it. We just need to allocate the register appropriately.
3400 InputReg = DAG.getUNDEF(ArgVT);
3401 }
3402
3403 if (OutgoingArg->isRegister()) {
3404 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3405 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3406 report_fatal_error("failed to allocate implicit input argument");
3407 } else {
3408 unsigned SpecialArgOffset =
3409 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
3410 SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
3411 SpecialArgOffset);
3412 MemOpChains.push_back(ArgStore);
3413 }
3414 }
3415
3416 // Pack workitem IDs into a single register, or pass them as-is if already
3417 // packed.
3418 const ArgDescriptor *OutgoingArg;
3419 const TargetRegisterClass *ArgRC;
3420 LLT Ty;
3421
3422 std::tie(OutgoingArg, ArgRC, Ty) =
3424 if (!OutgoingArg)
3425 std::tie(OutgoingArg, ArgRC, Ty) =
3427 if (!OutgoingArg)
3428 std::tie(OutgoingArg, ArgRC, Ty) =
3430 if (!OutgoingArg)
3431 return;
3432
3433 const ArgDescriptor *IncomingArgX = std::get<0>(
3435 const ArgDescriptor *IncomingArgY = std::get<0>(
3437 const ArgDescriptor *IncomingArgZ = std::get<0>(
3439
3440 SDValue InputReg;
3441 SDLoc SL;
3442
3443 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3444 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3445 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3446
3447 // If the incoming IDs are not packed, we need to pack them.
3448 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
3449 NeedWorkItemIDX) {
3450 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3451 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
3452 } else {
3453 InputReg = DAG.getConstant(0, DL, MVT::i32);
3454 }
3455 }
3456
3457 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
3458 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3459 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
3460 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
3461 DAG.getShiftAmountConstant(10, MVT::i32, SL));
3462 InputReg = InputReg.getNode() ?
3463 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y) : Y;
3464 }
3465
3466 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
3467 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
3468 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
3469 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
3470 DAG.getShiftAmountConstant(20, MVT::i32, SL));
3471 InputReg = InputReg.getNode() ?
3472 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z) : Z;
3473 }
3474
3475 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3476 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3477 // We're in a situation where the outgoing function requires the workitem
3478 // ID, but the calling function does not have it (e.g. a graphics function
3479 // calling a C calling convention function). This is illegal, but we need
3480 // to produce something.
3481 InputReg = DAG.getUNDEF(MVT::i32);
3482 } else {
3483 // Workitem IDs are already packed; any of the present incoming arguments
3484 // will carry all the required fields.
3486 IncomingArgX ? *IncomingArgX :
3487 IncomingArgY ? *IncomingArgY :
3488 *IncomingArgZ, ~0u);
3489 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
3490 }
3491 }
3492
3493 if (OutgoingArg->isRegister()) {
3494 if (InputReg)
3495 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3496
3497 CCInfo.AllocateReg(OutgoingArg->getRegister());
3498 } else {
3499 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
3500 if (InputReg) {
3501 SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
3502 SpecialArgOffset);
3503 MemOpChains.push_back(ArgStore);
3504 }
3505 }
3506}
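
// Illustrative sketch (not part of the lowering above): the SHL/OR nodes built
// in passSpecialInputs pack the workitem IDs as X in bits [9:0], Y in bits
// [19:10] and Z in bits [29:20] of a single 32-bit value. A scalar model,
// assuming each ID already fits in 10 bits; packWorkitemIDs is a hypothetical
// helper name, not an API of this file.
static unsigned packWorkitemIDs(unsigned X, unsigned Y, unsigned Z) {
  return X | (Y << 10) | (Z << 20); // mirrors the shift amounts 10 and 20 above
}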
3507
3508static bool canGuaranteeTCO(CallingConv::ID CC) {
3509 return CC == CallingConv::Fast;
3510}
3511
3512/// Return true if we might ever do TCO for calls with this calling convention.
3513static bool mayTailCallThisCC(CallingConv::ID CC) {
3514 switch (CC) {
3515 case CallingConv::C:
3517 return true;
3518 default:
3519 return canGuaranteeTCO(CC);
3520 }
3521}
3522
3524 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
3526 const SmallVectorImpl<SDValue> &OutVals,
3527 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3528 if (AMDGPU::isChainCC(CalleeCC))
3529 return true;
3530
3531 if (!mayTailCallThisCC(CalleeCC))
3532 return false;
3533
3534 // For a divergent call target, we need to do a waterfall loop over the
3535 // possible callees which precludes us from using a simple jump.
3536 if (Callee->isDivergent())
3537 return false;
3538
3540 const Function &CallerF = MF.getFunction();
3541 CallingConv::ID CallerCC = CallerF.getCallingConv();
3543 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3544
3545 // Kernels aren't callable, and don't have a live-in return address, so it
3546 // doesn't make sense to do a tail call with entry functions.
3547 if (!CallerPreserved)
3548 return false;
3549
3550 bool CCMatch = CallerCC == CalleeCC;
3551
3553 if (canGuaranteeTCO(CalleeCC) && CCMatch)
3554 return true;
3555 return false;
3556 }
3557
3558 // TODO: Can we handle var args?
3559 if (IsVarArg)
3560 return false;
3561
3562 for (const Argument &Arg : CallerF.args()) {
3563 if (Arg.hasByValAttr())
3564 return false;
3565 }
3566
3567 LLVMContext &Ctx = *DAG.getContext();
3568
3569 // Check that the call results are passed in the same way.
3570 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
3571 CCAssignFnForCall(CalleeCC, IsVarArg),
3572 CCAssignFnForCall(CallerCC, IsVarArg)))
3573 return false;
3574
3575 // The callee has to preserve all registers the caller needs to preserve.
3576 if (!CCMatch) {
3577 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3578 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3579 return false;
3580 }
3581
3582 // Nothing more to check if the callee is taking no arguments.
3583 if (Outs.empty())
3584 return true;
3585
3587 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
3588
3589 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
3590
3591 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
3592 // If the stack arguments for this call do not fit into our own save area then
3593 // the call cannot be made tail.
3594 // TODO: Is this really necessary?
3595 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
3596 return false;
3597
3598 const MachineRegisterInfo &MRI = MF.getRegInfo();
3599 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
3600}
3601
3603 if (!CI->isTailCall())
3604 return false;
3605
3606 const Function *ParentFn = CI->getParent()->getParent();
3608 return false;
3609 return true;
3610}
3611
3612// The wave scratch offset register is used as the global base pointer.
3614 SmallVectorImpl<SDValue> &InVals) const {
3615 CallingConv::ID CallConv = CLI.CallConv;
3616 bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
3617
3618 SelectionDAG &DAG = CLI.DAG;
3619
3620 TargetLowering::ArgListEntry RequestedExec;
3621 if (IsChainCallConv) {
3622 // The last argument should be the value that we need to put in EXEC.
3623 // Pop it out of CLI.Outs and CLI.OutVals before we do any processing so we
3624 // don't treat it like the rest of the arguments.
3625 RequestedExec = CLI.Args.back();
3626 assert(RequestedExec.Node && "No node for EXEC");
3627
3628 if (!RequestedExec.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
3629 return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
3630
3631 assert(CLI.Outs.back().OrigArgIndex == 2 && "Unexpected last arg");
3632 CLI.Outs.pop_back();
3633 CLI.OutVals.pop_back();
3634
3635 if (RequestedExec.Ty->isIntegerTy(64)) {
3636 assert(CLI.Outs.back().OrigArgIndex == 2 && "Exec wasn't split up");
3637 CLI.Outs.pop_back();
3638 CLI.OutVals.pop_back();
3639 }
3640
3641 assert(CLI.Outs.back().OrigArgIndex != 2 &&
3642 "Haven't popped all the pieces of the EXEC mask");
3643 }
3644
3645 const SDLoc &DL = CLI.DL;
3647 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
3649 SDValue Chain = CLI.Chain;
3650 SDValue Callee = CLI.Callee;
3651 bool &IsTailCall = CLI.IsTailCall;
3652 bool IsVarArg = CLI.IsVarArg;
3653 bool IsSibCall = false;
3655
3656 if (Callee.isUndef() || isNullConstant(Callee)) {
3657 if (!CLI.IsTailCall) {
3658 for (ISD::InputArg &Arg : CLI.Ins)
3659 InVals.push_back(DAG.getUNDEF(Arg.VT));
3660 }
3661
3662 return Chain;
3663 }
3664
3665 if (IsVarArg) {
3666 return lowerUnhandledCall(CLI, InVals,
3667 "unsupported call to variadic function ");
3668 }
3669
3670 if (!CLI.CB)
3671 report_fatal_error("unsupported libcall legalization");
3672
3673 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
3674 return lowerUnhandledCall(CLI, InVals,
3675 "unsupported required tail call to function ");
3676 }
3677
3678 if (IsTailCall) {
3680 Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
3681 if (!IsTailCall &&
3682 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
3683 report_fatal_error("failed to perform tail call elimination on a call "
3684 "site marked musttail or on llvm.amdgcn.cs.chain");
3685 }
3686
3687 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3688
3689 // A sibling call is one where we're under the usual C ABI and not planning
3690 // to change that but can still do a tail call:
3691 if (!TailCallOpt && IsTailCall)
3692 IsSibCall = true;
3693
3694 if (IsTailCall)
3695 ++NumTailCalls;
3696 }
3697
3700 SmallVector<SDValue, 8> MemOpChains;
3701
3702 // Analyze operands of the call, assigning locations to each operand.
3704 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
3705 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
3706
3707 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv)) {
3708 // With a fixed ABI, allocate fixed registers before user arguments.
3709 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
3710 }
3711
3712 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
3713
3714 // Get a count of how many bytes are to be pushed on the stack.
3715 unsigned NumBytes = CCInfo.getStackSize();
3716
3717 if (IsSibCall) {
3718 // Since we're not changing the ABI to make this a tail call, the memory
3719 // operands are already available in the caller's incoming argument space.
3720 NumBytes = 0;
3721 }
3722
3723 // FPDiff is the byte offset of the call's argument area from the callee's.
3724 // Stores to callee stack arguments will be placed in FixedStackSlots offset
3725 // by this amount for a tail call. In a sibling call it must be 0 because the
3726 // caller will deallocate the entire stack and the callee still expects its
3727 // arguments to begin at SP+0. Completely unused for non-tail calls.
3728 int32_t FPDiff = 0;
3729 MachineFrameInfo &MFI = MF.getFrameInfo();
3730
3731 // Adjust the stack pointer for the new arguments...
3732 // These operations are automatically eliminated by the prolog/epilog pass
3733 if (!IsSibCall)
3734 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
3735
3736 if (!IsSibCall || IsChainCallConv) {
3737 if (!Subtarget->enableFlatScratch()) {
3738 SmallVector<SDValue, 4> CopyFromChains;
3739
3740 // In the HSA case, this should be an identity copy.
3741 SDValue ScratchRSrcReg
3742 = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
3743 RegsToPass.emplace_back(IsChainCallConv
3744 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
3745 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
3746 ScratchRSrcReg);
3747 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
3748 Chain = DAG.getTokenFactor(DL, CopyFromChains);
3749 }
3750 }
3751
3752 MVT PtrVT = MVT::i32;
3753
3754 // Walk the register/memloc assignments, inserting copies/loads.
3755 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3756 CCValAssign &VA = ArgLocs[i];
3757 SDValue Arg = OutVals[i];
3758
3759 // Promote the value if needed.
3760 switch (VA.getLocInfo()) {
3761 case CCValAssign::Full:
3762 break;
3763 case CCValAssign::BCvt:
3764 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3765 break;
3766 case CCValAssign::ZExt:
3767 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3768 break;
3769 case CCValAssign::SExt:
3770 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3771 break;
3772 case CCValAssign::AExt:
3773 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3774 break;
3775 case CCValAssign::FPExt:
3776 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
3777 break;
3778 default:
3779 llvm_unreachable("Unknown loc info!");
3780 }
3781
3782 if (VA.isRegLoc()) {
3783 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
3784 } else {
3785 assert(VA.isMemLoc());
3786
3787 SDValue DstAddr;
3788 MachinePointerInfo DstInfo;
3789
3790 unsigned LocMemOffset = VA.getLocMemOffset();
3791 int32_t Offset = LocMemOffset;
3792
3793 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
3794 MaybeAlign Alignment;
3795
3796 if (IsTailCall) {
3797 ISD::ArgFlagsTy Flags = Outs[i].Flags;
3798 unsigned OpSize = Flags.isByVal() ?
3799 Flags.getByValSize() : VA.getValVT().getStoreSize();
3800
3801 // FIXME: We can have better than the minimum byval required alignment.
3802 Alignment =
3803 Flags.isByVal()
3804 ? Flags.getNonZeroByValAlign()
3805 : commonAlignment(Subtarget->getStackAlignment(), Offset);
3806
3807 Offset = Offset + FPDiff;
3808 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
3809
3810 DstAddr = DAG.getFrameIndex(FI, PtrVT);
3811 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
3812
3813 // Make sure any stack arguments overlapping with where we're storing
3814 // are loaded before this eventual operation. Otherwise they'll be
3815 // clobbered.
3816
3817 // FIXME: Why is this really necessary? This seems to just result in a
3818 // lot of code to copy the stack and write them back to the same
3819 // locations, which are supposed to be immutable?
3820 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
3821 } else {
3822 // Stores to the argument stack area are relative to the stack pointer.
3823 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
3824 MVT::i32);
3825 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
3826 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
3827 Alignment =
3828 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
3829 }
3830
3831 if (Outs[i].Flags.isByVal()) {
3832 SDValue SizeNode =
3833 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
3834 SDValue Cpy =
3835 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
3836 Outs[i].Flags.getNonZeroByValAlign(),
3837 /*isVol = */ false, /*AlwaysInline = */ true,
3838 /*CI=*/nullptr, std::nullopt, DstInfo,
3840
3841 MemOpChains.push_back(Cpy);
3842 } else {
3843 SDValue Store =
3844 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
3845 MemOpChains.push_back(Store);
3846 }
3847 }
3848 }
3849
3850 if (!MemOpChains.empty())
3851 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
3852
3853 // Build a sequence of copy-to-reg nodes chained together with token chain
3854 // and flag operands which copy the outgoing args into the appropriate regs.
3855 SDValue InGlue;
3856 for (auto &RegToPass : RegsToPass) {
3857 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
3858 RegToPass.second, InGlue);
3859 InGlue = Chain.getValue(1);
3860 }
3861
3862
3863 // We don't usually want to end the call-sequence here because we would tidy
3864 // the frame up *after* the call; however, in the ABI-changing tail-call case
3865 // we've carefully laid out the parameters so that when sp is reset they'll be
3866 // in the correct location.
3867 if (IsTailCall && !IsSibCall) {
3868 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
3869 InGlue = Chain.getValue(1);
3870 }
3871
3872 std::vector<SDValue> Ops;
3873 Ops.push_back(Chain);
3874 Ops.push_back(Callee);
3875 // Add a redundant copy of the callee global which will not be legalized, as
3876 // we need direct access to the callee later.
3878 const GlobalValue *GV = GSD->getGlobal();
3879 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
3880 } else {
3881 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
3882 }
3883
3884 if (IsTailCall) {
3885 // Each tail call may have to adjust the stack by a different amount, so
3886 // this information must travel along with the operation for eventual
3887 // consumption by emitEpilogue.
3888 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
3889 }
3890
3891 if (IsChainCallConv)
3892 Ops.push_back(RequestedExec.Node);
3893
3894 // Add argument registers to the end of the list so that they are known live
3895 // into the call.
3896 for (auto &RegToPass : RegsToPass) {
3897 Ops.push_back(DAG.getRegister(RegToPass.first,
3898 RegToPass.second.getValueType()));
3899 }
3900
3901 // Add a register mask operand representing the call-preserved registers.
3902 auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
3903 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
3904 assert(Mask && "Missing call preserved mask for calling convention");
3905 Ops.push_back(DAG.getRegisterMask(Mask));
3906
3907 if (SDValue Token = CLI.ConvergenceControlToken) {
3909 GlueOps.push_back(Token);
3910 if (InGlue)
3911 GlueOps.push_back(InGlue);
3912
3913 InGlue = SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, DL,
3914 MVT::Glue, GlueOps),
3915 0);
3916 }
3917
3918 if (InGlue)
3919 Ops.push_back(InGlue);
3920
3921 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3922
3923 // If we're doing a tail call, use a TC_RETURN here rather than an
3924 // actual call instruction.
3925 if (IsTailCall) {
3926 MFI.setHasTailCall();
3927 unsigned OPC = AMDGPUISD::TC_RETURN;
3928 switch (CallConv) {
3931 break;
3935 break;
3936 }
3937
3938 return DAG.getNode(OPC, DL, NodeTys, Ops);
3939 }
3940
3941 // Returns a chain and a flag for retval copy to use.
3942 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
3943 Chain = Call.getValue(0);
3944 InGlue = Call.getValue(1);
3945
3946 uint64_t CalleePopBytes = NumBytes;
3947 Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
3948 if (!Ins.empty())
3949 InGlue = Chain.getValue(1);
3950
3951 // Handle result values, copying them out of physregs into vregs that we
3952 // return.
3953 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
3954 InVals, /*IsThisReturn=*/false, SDValue());
3955}
3956
3957// This is identical to the default implementation in ExpandDYNAMIC_STACKALLOC,
3958// except for applying the wave size scale to the increment amount.
3960 SDValue Op, SelectionDAG &DAG) const {
3961 const MachineFunction &MF = DAG.getMachineFunction();
3963
3964 SDLoc dl(Op);
3965 EVT VT = Op.getValueType();
3966 SDValue Tmp1 = Op;
3967 SDValue Tmp2 = Op.getValue(1);
3968 SDValue Tmp3 = Op.getOperand(2);
3969 SDValue Chain = Tmp1.getOperand(0);
3970
3971 Register SPReg = Info->getStackPtrOffsetReg();
3972
3973 // Chain the dynamic stack allocation so that it doesn't modify the stack
3974 // pointer when other instructions are using the stack.
3975 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
3976
3977 SDValue Size = Tmp2.getOperand(1);
3978 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
3979 Chain = SP.getValue(1);
3980 MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();
3981 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
3982 unsigned Opc =
3985
3986 SDValue ScaledSize = DAG.getNode(
3987 ISD::SHL, dl, VT, Size,
3988 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
3989
3990 Align StackAlign = TFL->getStackAlign();
3991 Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize); // Value
3992 if (Alignment && *Alignment > StackAlign) {
3993 Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
3994 DAG.getConstant(-(uint64_t)Alignment->value()
3995 << Subtarget->getWavefrontSizeLog2(),
3996 dl, VT));
3997 }
3998
3999 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
4000 Tmp2 = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
4001
4002 return DAG.getMergeValues({Tmp1, Tmp2}, dl);
4003}
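
// Illustrative sketch (not part of this file): the expansion above differs from
// the generic DYNAMIC_STACKALLOC expansion only in scaling the per-lane
// allocation size by the wavefront size before bumping the wave-uniform stack
// pointer, since every lane of the wave needs its own copy of the object.
// A scalar model, using a hypothetical helper name:
static unsigned scaleAllocaSize(unsigned PerLaneBytes, unsigned WavefrontSizeLog2) {
  // mirrors the ISD::SHL by getWavefrontSizeLog2() above
  return PerLaneBytes << WavefrontSizeLog2; // e.g. 16 bytes * 64 lanes = 1024
}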
4004
4006 SelectionDAG &DAG) const {
4007 // We only handle constant sizes here to allow non-entry block, static sized
4008 // allocas. A truly dynamic value is more difficult to support because we
4009 // don't know if the size value is uniform or not. If the size isn't uniform,
4010 // we would need to do a wave reduction to get the maximum size to know how
4011 // much to increment the uniform stack pointer.
4012 SDValue Size = Op.getOperand(1);
4014 return lowerDYNAMIC_STACKALLOCImpl(Op, DAG); // Use "generic" expansion.
4015
4017}
4018
4020 if (Op.getValueType() != MVT::i32)
4021 return Op; // Defer to cannot select error.
4022
4024 SDLoc SL(Op);
4025
4026 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
4027
4028 // Convert from wave uniform to swizzled vector address. This should protect
4029 // from any edge cases where the stacksave result isn't directly used with
4030 // stackrestore.
4031 SDValue VectorAddress =
4032 DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
4033 return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
4034}
4035
4037 SelectionDAG &DAG) const {
4038 SDLoc SL(Op);
4039 assert(Op.getValueType() == MVT::i32);
4040
4041 uint32_t BothRoundHwReg =
4043 SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4044
4045 SDValue IntrinID =
4046 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4047 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
4048 Op.getOperand(0), IntrinID, GetRoundBothImm);
4049
4050 // There are two rounding modes, one for f32 and one for f64/f16. We only
4051 // report in the standard value range if both are the same.
4052 //
4053 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4054 // ties away from zero is not supported, and the other values are rotated by
4055 // 1.
4056 //
4057 // If the two rounding modes are not the same, report a target defined value.
4058
4059 // Mode register rounding mode fields:
4060 //
4061 // [1:0] Single-precision round mode.
4062 // [3:2] Double/Half-precision round mode.
4063 //
4064 // 0=nearest even; 1= +infinity; 2= -infinity, 3= toward zero.
4065 //
4066 //                  Hardware   Spec
4067 //   Toward-0          3         0
4068 //   Nearest Even      0         1
4069 //   +Inf              1         2
4070 //   -Inf              2         3
4071 //   NearestAway0     N/A        4
4072 //
4073 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4074 // table we can index by the raw hardware mode.
4075 //
4076 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
4077
4078 SDValue BitTable =
4080
4081 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4082 SDValue RoundModeTimesNumBits =
4083 DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
4084
4085 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4086 // knew only one mode was demanded.
4087 SDValue TableValue =
4088 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4089 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4090
4091 SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
4092 SDValue TableEntry =
4093 DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
4094
4095 // There's a gap between the 4-bit encoded table values and the actual enum
4096 // values, so offset if it's an extended value.
4097 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4098 SDValue IsStandardValue =
4099 DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
4100 SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
4101 SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4102 TableEntry, EnumOffset);
4103
4104 return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
4105}
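
// Illustrative sketch (not part of this file): a scalar model of the
// FLT_ROUNDS computation built above. BitTable stands for the 64-bit
// FltRoundConversionTable constant; the 4-bit hardware MODE.fp_round value
// selects a 4-bit table entry, and entries >= 4 are shifted up into the
// extended FLT_ROUNDS range.
static unsigned fltRoundsFromHwMode(unsigned long long BitTable, unsigned HwMode) {
  unsigned Entry = (unsigned)(BitTable >> (HwMode * 4)) & 0xf; // SRL + AND above
  return Entry < 4 ? Entry : Entry + 4;                        // SETULT/ADD/SELECT above
}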
4106
4108 SelectionDAG &DAG) const {
4109 SDLoc SL(Op);
4110
4111 SDValue NewMode = Op.getOperand(1);
4112 assert(NewMode.getValueType() == MVT::i32);
4113
4114 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4115 // hardware MODE.fp_round values.
4116 if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
4117 uint32_t ClampedVal = std::min(
4118 static_cast<uint32_t>(ConstMode->getZExtValue()),
4120 NewMode = DAG.getConstant(
4121 AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
4122 } else {
4123 // If we know the input can only be one of the supported standard modes in
4124 // the range 0-3, we can use a simplified mapping to hardware values.
4125 KnownBits KB = DAG.computeKnownBits(NewMode);
4126 const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
4127 // The supported standard values are 0-3. The extended values start at 8. We
4128 // need to offset by 4 if the value is in the extended range.
4129
4130 if (UseReducedTable) {
4131 // Truncate to the low 32-bits.
4132 SDValue BitTable = DAG.getConstant(
4133 AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
4134
4135 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4136 SDValue RoundModeTimesNumBits =
4137 DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
4138
4139 NewMode =
4140 DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
4141
4142 // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4143 // the table extracted bits into inline immediates.
4144 } else {
4145 // table_index = umin(value, value - 4)
4146 // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
4147 SDValue BitTable =
4149
4150 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4151 SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
4152 SDValue IndexVal =
4153 DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum);
4154
4155 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4156 SDValue RoundModeTimesNumBits =
4157 DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
4158
4159 SDValue TableValue =
4160 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4161 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4162
4163 // No need to mask out the high bits since the setreg will ignore them
4164 // anyway.
4165 NewMode = TruncTable;
4166 }
4167
4168 // Insert a readfirstlane in case the value is a VGPR. We could do this
4169 // earlier and keep more operations scalar, but that interferes with
4170 // combining the source.
4171 SDValue ReadFirstLaneID =
4172 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4173 NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4174 ReadFirstLaneID, NewMode);
4175 }
4176
4177 // N.B. The setreg will be later folded into s_round_mode on supported
4178 // targets.
4179 SDValue IntrinID =
4180 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4181 uint32_t BothRoundHwReg =
4183 SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4184
4185 SDValue SetReg =
4186 DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
4187 IntrinID, RoundBothImm, NewMode);
4188
4189 return SetReg;
4190}
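
// Illustrative sketch (not part of this file): a scalar model of the general
// (non-constant, non-reduced) path above, mapping a FLT_ROUNDS value back to
// the hardware MODE.fp_round field. For values 0-3 the unsigned subtraction
// wraps around, so umin picks the value itself; for extended values (>= 8 per
// the comment above) it picks value - 4, closing the gap in the table index.
static unsigned hwModeFromFltRounds(unsigned long long BitTable, unsigned Value) {
  unsigned Wrapped = Value - 4u;                      // ISD::SUB above
  unsigned Index = Value < Wrapped ? Value : Wrapped; // ISD::UMIN above
  // The mask is only for the standalone model; the setreg ignores the high bits.
  return (unsigned)(BitTable >> (Index * 4)) & 0xf;   // SRL + TRUNCATE above
}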
4191
4193 if (Op->isDivergent())
4194 return SDValue();
4195
4196 switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4201 break;
4202 default:
4203 return SDValue();
4204 }
4205
4206 return Op;
4207}
4208
4209// Work around DAG legality rules only based on the result type.
4211 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4212 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4213 EVT SrcVT = Src.getValueType();
4214
4215 if (SrcVT.getScalarType() != MVT::bf16)
4216 return Op;
4217
4218 SDLoc SL(Op);
4219 SDValue BitCast =
4220 DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
4221
4222 EVT DstVT = Op.getValueType();
4223 if (IsStrict)
4224 llvm_unreachable("Need STRICT_BF16_TO_FP");
4225
4226 return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4227}
4228
4230 SDLoc SL(Op);
4231 if (Op.getValueType() != MVT::i64)
4232 return Op;
4233
4234 uint32_t ModeHwReg =
4236 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4237 uint32_t TrapHwReg =
4239 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4240
4241 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
4242 SDValue IntrinID =
4243 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4244 SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4245 Op.getOperand(0), IntrinID, ModeHwRegImm);
4246 SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4247 Op.getOperand(0), IntrinID, TrapHwRegImm);
4248 SDValue TokenReg =
4249 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
4250 GetTrapReg.getValue(1));
4251
4252 SDValue CvtPtr =
4253 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
4254 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
4255
4256 return DAG.getMergeValues({Result, TokenReg}, SL);
4257}
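
// Illustrative sketch (not part of this file): the BUILD_VECTOR/BITCAST above
// packs the two 32-bit hardware registers into one i64 FP environment value.
// Assuming the usual element-0-in-the-low-bits layout of a v2i32 -> i64
// bitcast, the MODE register lands in the low half and the trap-control
// register in the high half; packFPEnv is a hypothetical helper name.
static unsigned long long packFPEnv(unsigned ModeReg, unsigned TrapReg) {
  return ((unsigned long long)TrapReg << 32) | ModeReg;
}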
4258
4260 SDLoc SL(Op);
4261 if (Op.getOperand(1).getValueType() != MVT::i64)
4262 return Op;
4263
4264 SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
4265 SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4266 DAG.getConstant(0, SL, MVT::i32));
4267 SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4268 DAG.getConstant(1, SL, MVT::i32));
4269
4270 SDValue ReadFirstLaneID =
4271 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4272 NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4273 ReadFirstLaneID, NewModeReg);
4274 NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4275 ReadFirstLaneID, NewTrapReg);
4276
4277 unsigned ModeHwReg =
4279 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4280 unsigned TrapHwReg =
4282 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4283
4284 SDValue IntrinID =
4285 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4286 SDValue SetModeReg =
4287 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4288 IntrinID, ModeHwRegImm, NewModeReg);
4289 SDValue SetTrapReg =
4290 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4291 IntrinID, TrapHwRegImm, NewTrapReg);
4292 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg);
4293}
4294
4296 const MachineFunction &MF) const {
4298 .Case("m0", AMDGPU::M0)
4299 .Case("exec", AMDGPU::EXEC)
4300 .Case("exec_lo", AMDGPU::EXEC_LO)
4301 .Case("exec_hi", AMDGPU::EXEC_HI)
4302 .Case("flat_scratch", AMDGPU::FLAT_SCR)
4303 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4304 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4305 .Default(Register());
4306
4307 if (Reg == AMDGPU::NoRegister) {
4308 report_fatal_error(Twine("invalid register name \""
4309 + StringRef(RegName) + "\"."));
4310
4311 }
4312
4313 if (!Subtarget->hasFlatScrRegister() &&
4314 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4315 report_fatal_error(Twine("invalid register \""
4316 + StringRef(RegName) + "\" for subtarget."));
4317 }
4318
4319 switch (Reg) {
4320 case AMDGPU::M0:
4321 case AMDGPU::EXEC_LO:
4322 case AMDGPU::EXEC_HI:
4323 case AMDGPU::FLAT_SCR_LO:
4324 case AMDGPU::FLAT_SCR_HI:
4325 if (VT.getSizeInBits() == 32)
4326 return Reg;
4327 break;
4328 case AMDGPU::EXEC:
4329 case AMDGPU::FLAT_SCR:
4330 if (VT.getSizeInBits() == 64)
4331 return Reg;
4332 break;
4333 default:
4334 llvm_unreachable("missing register type checking");
4335 }
4336
4337 report_fatal_error(Twine("invalid type for register \""
4338 + StringRef(RegName) + "\"."));
4339}
4340
4341// If kill is not the last instruction, split the block so kill is always a
4342// proper terminator.
4345 MachineBasicBlock *BB) const {
4346 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
4348 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
4349 return SplitBB;
4350}
4351
4352 // Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is true,
4353// \p MI will be the only instruction in the loop body block. Otherwise, it will
4354// be the first instruction in the remainder block.
4355//
4356/// \returns { LoopBody, Remainder }
4357static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4361
4362 // To insert the loop we need to split the block. Move everything after this
4363 // point to a new block, and insert a new empty block between the two.
4365 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
4367 ++MBBI;
4368
4369 MF->insert(MBBI, LoopBB);
4370 MF->insert(MBBI, RemainderBB);
4371
4372 LoopBB->addSuccessor(LoopBB);
4373 LoopBB->addSuccessor(RemainderBB);
4374
4375 // Move the rest of the block into a new block.
4376 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
4377
4378 if (InstInLoop) {
4379 auto Next = std::next(I);
4380
4381 // Move instruction to loop body.
4382 LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
4383
4384 // Move the rest of the block.
4385 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
4386 } else {
4387 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
4388 }
4389
4390 MBB.addSuccessor(LoopBB);
4391
4392 return std::pair(LoopBB, RemainderBB);
4393}
4394
4395/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
4397 MachineBasicBlock *MBB = MI.getParent();
4399 auto I = MI.getIterator();
4400 auto E = std::next(I);
4401
4402 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
4403 .addImm(0);
4404
4405 MIBundleBuilder Bundler(*MBB, I, E);
4406 finalizeBundle(*MBB, Bundler.begin());
4407}
4408
4411 MachineBasicBlock *BB) const {
4412 const DebugLoc &DL = MI.getDebugLoc();
4413
4415
4416 MachineBasicBlock *LoopBB;
4417 MachineBasicBlock *RemainderBB;
4419
4420 // Apparently kill flags are only valid if the def is in the same block?
4421 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
4422 Src->setIsKill(false);
4423
4424 std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, *BB, true);
4425
4426 MachineBasicBlock::iterator I = LoopBB->end();
4427
4428 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
4430
4431 // Clear TRAP_STS.MEM_VIOL
4432 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
4433 .addImm(0)
4434 .addImm(EncodedReg);
4435
4437
4438 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4439
4440 // Load and check TRAP_STS.MEM_VIOL
4441 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
4442 .addImm(EncodedReg);
4443
4444 // FIXME: Do we need to use an isel pseudo that may clobber scc?
4445 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
4446 .addReg(Reg, RegState::Kill)
4447 .addImm(0);
4448 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
4449 .addMBB(LoopBB);
4450
4451 return RemainderBB;
4452}
4453
4454// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
4455// wavefront. If the value is uniform and just happens to be in a VGPR, this
4456// will only do one iteration. In the worst case, this will loop 64 times.
4457//
4458// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
4461 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
4462 const DebugLoc &DL, const MachineOperand &Idx,
4463 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
4464 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
4465 Register &SGPRIdxReg) {
4466
4467 MachineFunction *MF = OrigBB.getParent();
4468 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4469 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4471
4472 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
4473 Register PhiExec = MRI.createVirtualRegister(BoolRC);
4474 Register NewExec = MRI.createVirtualRegister(BoolRC);
4475 Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4476 Register CondReg = MRI.createVirtualRegister(BoolRC);
4477
4478 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
4479 .addReg(InitReg)
4480 .addMBB(&OrigBB)
4481 .addReg(ResultReg)
4482 .addMBB(&LoopBB);
4483
4484 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
4485 .addReg(InitSaveExecReg)
4486 .addMBB(&OrigBB)
4487 .addReg(NewExec)
4488 .addMBB(&LoopBB);
4489
4490 // Read the next variant <- also loop target.
4491 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
4492 .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
4493
4494 // Compare the just read M0 value to all possible Idx values.
4495 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
4496 .addReg(CurrentIdxReg)
4497 .addReg(Idx.getReg(), 0, Idx.getSubReg());
4498
4499 // Update EXEC, save the original EXEC value to VCC.
4500 BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
4501 : AMDGPU::S_AND_SAVEEXEC_B64),
4502 NewExec)
4503 .addReg(CondReg, RegState::Kill);
4504
4505 MRI.setSimpleHint(NewExec, CondReg);
4506
4507 if (UseGPRIdxMode) {
4508 if (Offset == 0) {
4509 SGPRIdxReg = CurrentIdxReg;
4510 } else {
4511 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4512 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
4513 .addReg(CurrentIdxReg, RegState::Kill)
4514 .addImm(Offset);
4515 }
4516 } else {
4517 // Move index from VCC into M0
4518 if (Offset == 0) {
4519 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
4520 .addReg(CurrentIdxReg, RegState::Kill);
4521 } else {
4522 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4523 .addReg(CurrentIdxReg, RegState::Kill)
4524 .addImm(Offset);
4525 }
4526 }
4527
4528 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
4529 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4530 MachineInstr *InsertPt =
4531 BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
4532 : AMDGPU::S_XOR_B64_term), Exec)
4533 .addReg(Exec)
4534 .addReg(NewExec);
4535
4536 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
4537 // s_cbranch_scc0?
4538
4539 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
4540 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
4541 .addMBB(&LoopBB);
4542
4543 return InsertPt->getIterator();
4544}
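
// Illustrative sketch (not part of this file): a scalar model of the waterfall
// loop built above, assuming a wave64 EXEC mask and a hypothetical per-lane
// index array. Each iteration reads the index of the first remaining active
// lane, masks EXEC down to the lanes that share that index, runs the indexed
// move for them, and then removes those lanes until none are left.
static void waterfallModel(unsigned long long Exec, const unsigned *LaneIdx,
                           void (*Body)(unsigned Idx, unsigned long long ActiveLanes)) {
  while (Exec) {
    unsigned FirstLane = __builtin_ctzll(Exec);      // v_readfirstlane_b32
    unsigned Idx = LaneIdx[FirstLane];
    unsigned long long Same = 0;
    for (unsigned L = 0; L != 64; ++L)               // v_cmp_eq_u32
      if (((Exec >> L) & 1) && LaneIdx[L] == Idx)
        Same |= 1ull << L;
    Body(Idx, Same);                                 // the indexed move, with EXEC = Same
    Exec &= ~Same;                                   // s_xor_b64 exec, NewExec
  }
}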
4545
4546 // This has slightly sub-optimal register allocation when the source vector is
4547 // killed by the read. The register allocator does not understand that the kill
4548 // is per-workitem, so the value is kept alive for the whole loop, and we end up
4549 // not reusing a subregister from it, using one more VGPR than necessary. The
4550 // extra VGPR was avoided when this was expanded after register allocation.
4553 unsigned InitResultReg, unsigned PhiReg, int Offset,
4554 bool UseGPRIdxMode, Register &SGPRIdxReg) {
4556 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4557 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4559 const DebugLoc &DL = MI.getDebugLoc();
4561
4562 const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
4563 Register DstReg = MI.getOperand(0).getReg();
4564 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
4565 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
4566 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4567 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4568
4569 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
4570
4571 // Save the EXEC mask
4572 BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
4573 .addReg(Exec);
4574
4575 MachineBasicBlock *LoopBB;
4576 MachineBasicBlock *RemainderBB;
4577 std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, MBB, false);
4578
4579 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4580
4581 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
4582 InitResultReg, DstReg, PhiReg, TmpExec,
4583 Offset, UseGPRIdxMode, SGPRIdxReg);
4584
4585 MachineBasicBlock* LandingPad = MF->CreateMachineBasicBlock();
4587 ++MBBI;
4588 MF->insert(MBBI, LandingPad);
4589 LoopBB->removeSuccessor(RemainderBB);
4590 LandingPad->addSuccessor(RemainderBB);
4591 LoopBB->addSuccessor(LandingPad);
4592 MachineBasicBlock::iterator First = LandingPad->begin();
4593 BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec)
4594 .addReg(SaveExec);
4595
4596 return InsPt;
4597}
4598
4599// Returns subreg index, offset
4600static std::pair<unsigned, int>
4602 const TargetRegisterClass *SuperRC,
4603 unsigned VecReg,
4604 int Offset) {
4605 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
4606
4607 // Skip out of bounds offsets, or else we would end up using an undefined
4608 // register.
4609 if (Offset >= NumElts || Offset < 0)
4610 return std::pair(AMDGPU::sub0, Offset);
4611
4612 return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
4613}
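
// Illustrative example (not part of this file): for a 128-bit super-register
// (four 32-bit elements), a known in-range offset is folded straight into the
// subregister index, while an out-of-range offset is left for the dynamic
// index to handle:
//
//   computeIndirectRegAndOffset(TRI, &AMDGPU::VReg_128RegClass, VecReg, 2)
//       == {AMDGPU::sub2, 0}
//   computeIndirectRegAndOffset(TRI, &AMDGPU::VReg_128RegClass, VecReg, 6)
//       == {AMDGPU::sub0, 6}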
4614
4617 int Offset) {
4618 MachineBasicBlock *MBB = MI.getParent();
4619 const DebugLoc &DL = MI.getDebugLoc();
4621
4622 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4623
4624 assert(Idx->getReg() != AMDGPU::NoRegister);
4625
4626 if (Offset == 0) {
4627 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0).add(*Idx);
4628 } else {
4629 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4630 .add(*Idx)
4631 .addImm(Offset);
4632 }
4633}
4634
4637 int Offset) {
4638 MachineBasicBlock *MBB = MI.getParent();
4639 const DebugLoc &DL = MI.getDebugLoc();
4641
4642 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4643
4644 if (Offset == 0)
4645 return Idx->getReg();
4646
4647 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4648 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
4649 .add(*Idx)
4650 .addImm(Offset);
4651 return Tmp;
4652}
4653
4656 const GCNSubtarget &ST) {
4657 const SIInstrInfo *TII = ST.getInstrInfo();
4658 const SIRegisterInfo &TRI = TII->getRegisterInfo();
4661
4662 Register Dst = MI.getOperand(0).getReg();
4663 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4664 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
4665 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
4666
4667 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
4668 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
4669
4670 unsigned SubReg;
4671 std::tie(SubReg, Offset)
4672 = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
4673
4674 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4675
4676 // Check for a SGPR index.
4677 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4679 const DebugLoc &DL = MI.getDebugLoc();
4680
4681 if (UseGPRIdxMode) {
4682 // TODO: Look at the uses to avoid the copy. This may require rescheduling
4683 // to avoid interfering with other uses, so probably requires a new
4684 // optimization pass.
4686
4687 const MCInstrDesc &GPRIDXDesc =
4688 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
4689 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
4690 .addReg(SrcReg)
4691 .addReg(Idx)
4692 .addImm(SubReg);
4693 } else {
4695
4696 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4697 .addReg(SrcReg, 0, SubReg)
4698 .addReg(SrcReg, RegState::Implicit);
4699 }
4700
4701 MI.eraseFromParent();
4702
4703 return &MBB;
4704 }
4705
4706 // Control flow needs to be inserted if indexing with a VGPR.
4707 const DebugLoc &DL = MI.getDebugLoc();
4709
4710 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4711 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4712
4713 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
4714
4715 Register SGPRIdxReg;
4716 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
4717 UseGPRIdxMode, SGPRIdxReg);
4718
4719 MachineBasicBlock *LoopBB = InsPt->getParent();
4720
4721 if (UseGPRIdxMode) {
4722 const MCInstrDesc &GPRIDXDesc =
4723 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
4724
4725 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
4726 .addReg(SrcReg)
4727 .addReg(SGPRIdxReg)
4728 .addImm(SubReg);
4729 } else {
4730 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4731 .addReg(SrcReg, 0, SubReg)
4732 .addReg(SrcReg, RegState::Implicit);
4733 }
4734
4735 MI.eraseFromParent();
4736
4737 return LoopBB;
4738}
4739
4742 const GCNSubtarget &ST) {
4743 const SIInstrInfo *TII = ST.getInstrInfo();
4744 const SIRegisterInfo &TRI = TII->getRegisterInfo();
4747
4748 Register Dst = MI.getOperand(0).getReg();
4749 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
4750 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4751 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
4752 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
4753 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
4754 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
4755
4756 // This can be an immediate, but will be folded later.
4757 assert(Val->getReg());
4758
4759 unsigned SubReg;
4760 std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
4761 SrcVec->getReg(),
4762 Offset);
4763 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4764
4765 if (Idx->getReg() == AMDGPU::NoRegister) {
4767 const DebugLoc &DL = MI.getDebugLoc();
4768
4769 assert(Offset == 0);
4770
4771 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
4772 .add(*SrcVec)
4773 .add(*Val)
4774 .addImm(SubReg);
4775
4776 MI.eraseFromParent();
4777 return &MBB;
4778 }
4779
4780 // Check for a SGPR index.
4781 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4783 const DebugLoc &DL = MI.getDebugLoc();
4784
4785 if (UseGPRIdxMode) {
4787
4788 const MCInstrDesc &GPRIDXDesc =
4789 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
4790 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
4791 .addReg(SrcVec->getReg())
4792 .add(*Val)
4793 .addReg(Idx)
4794 .addImm(SubReg);
4795 } else {
4797
4798 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
4799 TRI.getRegSizeInBits(*VecRC), 32, false);
4800 BuildMI(MBB, I, DL, MovRelDesc, Dst)
4801 .addReg(SrcVec->getReg())
4802 .add(*Val)
4803 .addImm(SubReg);
4804 }
4805 MI.eraseFromParent();
4806 return &MBB;
4807 }
4808
4809 // Control flow needs to be inserted if indexing with a VGPR.
4810 if (Val->isReg())
4811 MRI.clearKillFlags(Val->getReg());
4812
4813 const DebugLoc &DL = MI.getDebugLoc();
4814
4815 Register PhiReg = MRI.createVirtualRegister(VecRC);
4816
4817 Register SGPRIdxReg;
4818 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
4819 UseGPRIdxMode, SGPRIdxReg);
4820 MachineBasicBlock *LoopBB = InsPt->getParent();
4821
4822 if (UseGPRIdxMode) {
4823 const MCInstrDesc &GPRIDXDesc =
4824 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
4825
4826 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
4827 .addReg(PhiReg)
4828 .add(*Val)
4829 .addReg(SGPRIdxReg)
4830 .addImm(SubReg);
4831 } else {
4832 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
4833 TRI.getRegSizeInBits(*VecRC), 32, false);
4834 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
4835 .addReg(PhiReg)
4836 .add(*Val)
4837 .addImm(SubReg);
4838 }
4839
4840 MI.eraseFromParent();
4841 return LoopBB;
4842}
4843
4846 const GCNSubtarget &ST,
4847 unsigned Opc) {
4849 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4850 const DebugLoc &DL = MI.getDebugLoc();
4851 const SIInstrInfo *TII = ST.getInstrInfo();
4852
4853 // Reduction operations depend on whether the input operand is SGPR or VGPR.
4854 Register SrcReg = MI.getOperand(1).getReg();
4855 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
4856 Register DstReg = MI.getOperand(0).getReg();
4857 MachineBasicBlock *RetBB = nullptr;
4858 if (isSGPR) {
4859 // These operations with a uniform value (i.e. an SGPR) are idempotent.
4860 // The reduced value will be the same as the given SGPR.
4861 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
4862 RetBB = &BB;
4863 } else {
4864 // TODO: Implement the DPP strategy and switch based on the immediate strategy
4865 // operand. For now, for all the cases (default, Iterative and DPP) we use the
4866 // iterative approach by default.
4867
4868 // To reduce the VGPR using the iterative approach, we need to iterate
4869 // over all the active lanes. Lowering consists of a ComputeLoop, which
4870 // iterates over only the active lanes. We use a copy of the EXEC register
4871 // as the induction variable, and each iteration clears the processed lane
4872 // with bitset0 to get the next active lane (scalar model after this function).
4874 Register SrcReg = MI.getOperand(1).getReg();
4875
4876 // Create control flow for the loop:
4877 // split MI's machine basic block into the loop body and its continuation.
4878 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
4879
4880 // Create virtual registers required for lowering.
4881 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
4882 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
4883 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
4884 Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
4885
4886 Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
4887 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
4888 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
4889
4890 Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
4891 Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
4892
4893 bool IsWave32 = ST.isWave32();
4894 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4895 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4896
4897 // Create the initial values of the induction variable (from EXEC) and of the
4898 // accumulator, and insert a branch instruction to the newly created ComputeLoop.
4899 uint32_t InitalValue =
4900 (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
4901 auto TmpSReg =
4902 BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
4903 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
4904 .addImm(InitalValue);
4905 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH)).addMBB(ComputeLoop);
4906
4907 // Start constructing ComputeLoop
4908 I = ComputeLoop->end();
4909 auto Accumulator =
4910 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
4911 .addReg(InitalValReg)
4912 .addMBB(&BB);
4913 auto ActiveBits =
4914 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
4915 .addReg(TmpSReg->getOperand(0).getReg())
4916 .addMBB(&BB);
4917
4918 // Perform the computations
4919 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
4920 auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
4921 .addReg(ActiveBits->getOperand(0).getReg());
4922 auto LaneValue = BuildMI(*ComputeLoop, I, DL,
4923 TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
4924 .addReg(SrcReg)
4925 .addReg(FF1->getOperand(0).getReg());
4926 auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
4927 .addReg(Accumulator->getOperand(0).getReg())
4928 .addReg(LaneValue->getOperand(0).getReg());
4929
4930 // Manipulate the iterator to get the next active lane
4931 unsigned BITSETOpc =
4932 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
4933 auto NewActiveBits =
4934 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
4935 .addReg(FF1->getOperand(0).getReg())
4936 .addReg(ActiveBits->getOperand(0).getReg());
4937
4938 // Add phi nodes
4939 Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
4940 .addMBB(ComputeLoop);
4941 ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
4942 .addMBB(ComputeLoop);
4943
4944 // Create the loop back-branch
4945 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
4946 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
4947 .addReg(NewActiveBits->getOperand(0).getReg())
4948 .addImm(0);
4949 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
4950 .addMBB(ComputeLoop);
4951
4952 RetBB = ComputeEnd;
4953 }
4954 MI.eraseFromParent();
4955 return RetBB;
4956}
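
// Illustrative sketch (not part of this file): a scalar model of the
// ComputeLoop built above for the unsigned min/max wave reductions, assuming a
// wave64 EXEC mask and a hypothetical per-lane value array.
static unsigned waveReduceModel(unsigned long long Exec, const unsigned *LaneVal,
                                bool IsMin) {
  unsigned Acc = IsMin ? 0xffffffffu : 0u;       // the InitalValue computed above
  while (Exec) {                                 // ComputeLoop
    unsigned Lane = __builtin_ctzll(Exec);       // s_ff1_i32
    unsigned V = LaneVal[Lane];                  // v_readlane_b32
    Acc = IsMin ? (V < Acc ? V : Acc)            // s_min_u32
                : (V > Acc ? V : Acc);           // s_max_u32
    Exec &= ~(1ull << Lane);                     // s_bitset0
  }                                              // s_cmp_lg + s_cbranch_scc1
  return Acc;
}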
4957
4959 MachineInstr &MI, MachineBasicBlock *BB) const {
4960
4962 MachineFunction *MF = BB->getParent();
4964
4965 switch (MI.getOpcode()) {
4966 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
4967 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
4968 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
4969 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
4970 case AMDGPU::S_UADDO_PSEUDO:
4971 case AMDGPU::S_USUBO_PSEUDO: {
4972 const DebugLoc &DL = MI.getDebugLoc();
4973 MachineOperand &Dest0 = MI.getOperand(0);
4974 MachineOperand &Dest1 = MI.getOperand(1);
4975 MachineOperand &Src0 = MI.getOperand(2);
4976 MachineOperand &Src1 = MI.getOperand(3);
4977
4978 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
4979 ? AMDGPU::S_ADD_I32
4980 : AMDGPU::S_SUB_I32;
4981 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg()).add(Src0).add(Src1);
4982
4983 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg())
4984 .addImm(1)
4985 .addImm(0);
4986
4987 MI.eraseFromParent();
4988 return BB;
4989 }
4990 case AMDGPU::S_ADD_U64_PSEUDO:
4991 case AMDGPU::S_SUB_U64_PSEUDO: {
4992 // For targets older than GFX12, we emit a sequence of 32-bit operations.
4993 // For GFX12, we emit s_add_u64 and s_sub_u64.
4994 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4996 const DebugLoc &DL = MI.getDebugLoc();
4997 MachineOperand &Dest = MI.getOperand(0);
4998 MachineOperand &Src0 = MI.getOperand(1);
4999 MachineOperand &Src1 = MI.getOperand(2);
5000 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5001 if (Subtarget->hasScalarAddSub64()) {
5002 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5003 BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5004 .add(Src0)
5005 .add(Src1);
5006 } else {
5007 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5008 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5009
5010 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5011 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5012
5013 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5014 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5015 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5016 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5017
5018 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5019 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5020 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5021 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5022
5023 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5024 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5025 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
5026 .add(Src0Sub0)
5027 .add(Src1Sub0);
5028 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
5029 .add(Src0Sub1)
5030 .add(Src1Sub1);
5031 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5032 .addReg(DestSub0)
5033 .addImm(AMDGPU::sub0)
5034 .addReg(DestSub1)
5035 .addImm(AMDGPU::sub1);
5036 }
5037 MI.eraseFromParent();
5038 return BB;
5039 }
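
// Illustrative sketch (not part of this file): a scalar model of the pre-GFX12
// expansion above. The 64-bit add/sub is split into a low 32-bit op that
// produces a carry/borrow (SCC) and a high 32-bit op that consumes it.
static unsigned long long addSubU64Model(unsigned long long A, unsigned long long B,
                                         bool IsAdd) {
  unsigned ALo = (unsigned)A, AHi = (unsigned)(A >> 32);
  unsigned BLo = (unsigned)B, BHi = (unsigned)(B >> 32);
  unsigned Lo = IsAdd ? ALo + BLo : ALo - BLo;                 // s_add_u32 / s_sub_u32
  unsigned Carry = IsAdd ? (Lo < ALo) : (ALo < BLo);           // SCC
  unsigned Hi = IsAdd ? AHi + BHi + Carry : AHi - BHi - Carry; // s_addc_u32 / s_subb_u32
  return ((unsigned long long)Hi << 32) | Lo;
}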
5040 case AMDGPU::V_ADD_U64_PSEUDO:
5041 case AMDGPU::V_SUB_U64_PSEUDO: {
5043 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5044 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5045 const DebugLoc &DL = MI.getDebugLoc();
5046
5047 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
5048
5049 MachineOperand &Dest = MI.getOperand(0);
5050 MachineOperand &Src0 = MI.getOperand(1);
5051 MachineOperand &Src1 = MI.getOperand(2);
5052
5053 if (IsAdd && ST.hasLshlAddB64()) {
5054 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
5055 Dest.getReg())
5056 .add(Src0)
5057 .addImm(0)
5058 .add(Src1);
5059 TII->legalizeOperands(*Add);
5060 MI.eraseFromParent();
5061 return BB;
5062 }
5063
5064 const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
5065
5066 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5067 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5068
5069 Register CarryReg = MRI.createVirtualRegister(CarryRC);
5070 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
5071
5072 const TargetRegisterClass *Src0RC = Src0.isReg()
5073 ? MRI.getRegClass(Src0.getReg())
5074 : &AMDGPU::VReg_64RegClass;
5075 const TargetRegisterClass *Src1RC = Src1.isReg()
5076 ? MRI.getRegClass(Src1.getReg())
5077 : &AMDGPU::VReg_64RegClass;
5078
5079 const TargetRegisterClass *Src0SubRC =
5080 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5081 const TargetRegisterClass *Src1SubRC =
5082 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5083
5084 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
5085 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5086 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
5087 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5088
5089 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
5090 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5091 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
5092 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5093
5094 unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
5095 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
5096 .addReg(CarryReg, RegState::Define)
5097 .add(SrcReg0Sub0)
5098 .add(SrcReg1Sub0)
5099 .addImm(0); // clamp bit
5100
5101 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
5102 MachineInstr *HiHalf =
5103 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
5104 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
5105 .add(SrcReg0Sub1)
5106 .add(SrcReg1Sub1)
5107 .addReg(CarryReg, RegState::Kill)
5108 .addImm(0); // clamp bit
5109
5110 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5111 .addReg(DestSub0)
5112 .addImm(AMDGPU::sub0)
5113 .addReg(DestSub1)
5114 .addImm(AMDGPU::sub1);
5115 TII->legalizeOperands(*LoHalf);
5116 TII->legalizeOperands(*HiHalf);
5117 MI.eraseFromParent();
5118 return BB;
5119 }
5120 case AMDGPU::S_ADD_CO_PSEUDO:
5121 case AMDGPU::S_SUB_CO_PSEUDO: {
5122 // This pseudo has a chance to be selected only from a uniform
5123 // add/subcarry node. All the VGPR operands are therefore assumed
5124 // to be splat vectors.
5126 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5127 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5129 const DebugLoc &DL = MI.getDebugLoc();
5130 MachineOperand &Dest = MI.getOperand(0);
5131 MachineOperand &CarryDest = MI.getOperand(1);
5132 MachineOperand &Src0 = MI.getOperand(2);
5133 MachineOperand &Src1 = MI.getOperand(3);
5134 MachineOperand &Src2 = MI.getOperand(4);
5135 unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
5136 ? AMDGPU::S_ADDC_U32
5137 : AMDGPU::S_SUBB_U32;
5138 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
5139 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5140 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
5141 .addReg(Src0.getReg());
5142 Src0.setReg(RegOp0);
5143 }
5144 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
5145 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5146 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
5147 .addReg(Src1.getReg());
5148 Src1.setReg(RegOp1);
5149 }
5150 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5151 if (TRI->isVectorRegister(MRI, Src2.getReg())) {
5152 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
5153 .addReg(Src2.getReg());
5154 Src2.setReg(RegOp2);
5155 }
5156
5157 const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
5158 unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
5159 assert(WaveSize == 64 || WaveSize == 32);
5160
5161 if (WaveSize == 64) {
5162 if (ST.hasScalarCompareEq64()) {
5163 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
5164 .addReg(Src2.getReg())
5165 .addImm(0);
5166 } else {
5167 const TargetRegisterClass *SubRC =
5168 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
5169 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
5170 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
5171 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
5172 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
5173 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5174
5175 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
5176 .add(Src2Sub0)
5177 .add(Src2Sub1);
5178
5179 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5180 .addReg(Src2_32, RegState::Kill)
5181 .addImm(0);
5182 }
5183 } else {
5184 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5185 .addReg(Src2.getReg())
5186 .addImm(0);
5187 }
5188
5189 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg()).add(Src0).add(Src1);
5190
5191 unsigned SelOpc =
5192 (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
5193
5194 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
5195 .addImm(-1)
5196 .addImm(0);
5197
5198 MI.eraseFromParent();
5199 return BB;
5200 }
5201 case AMDGPU::SI_INIT_M0: {
5202 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
5203 TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
5204 .add(MI.getOperand(0));
5205 MI.eraseFromParent();
5206 return BB;
5207 }
5208 case AMDGPU::GET_GROUPSTATICSIZE: {
5209 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
5210 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
5211 DebugLoc DL = MI.getDebugLoc();
5212 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
5213 .add(MI.getOperand(0))
5214 .addImm(MFI->getLDSSize());
5215 MI.eraseFromParent();
5216 return BB;
5217 }
5218 case AMDGPU::GET_SHADERCYCLESHILO: {
5221 const DebugLoc &DL = MI.getDebugLoc();
5222 // The algorithm is:
5223 //
5224 // hi1 = getreg(SHADER_CYCLES_HI)
5225 // lo1 = getreg(SHADER_CYCLES_LO)
5226 // hi2 = getreg(SHADER_CYCLES_HI)
5227 //
5228 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
5229 // Otherwise there was overflow and the result is hi2:0. In both cases the
5230 // result should represent the actual time at some point during the sequence
5231 // of three getregs.
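 // For example, if the counter advances from 0x00000001_ffffff00 to
 // 0x00000002_00000010 across the three reads, then hi1 = 1 and hi2 = 2, so
 // the result is hi2:0 = 0x00000002_00000000, a value the counter did hold at
 // some point during the sequence.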
5232 using namespace AMDGPU::Hwreg;
5233 Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5234 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
5235 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5236 Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5237 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
5238 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
5239 Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5240 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
5241 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5242 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
5243 .addReg(RegHi1)
5244 .addReg(RegHi2);
5245 Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5246 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
5247 .addReg(RegLo1)
5248 .addImm(0);
5249 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
5250 .add(MI.getOperand(0))
5251 .addReg(RegLo)
5252 .addImm(AMDGPU::sub0)
5253 .addReg(RegHi2)
5254 .addImm(AMDGPU::sub1);
5255 MI.eraseFromParent();
5256 return BB;
5257 }
5258 case AMDGPU::SI_INDIRECT_SRC_V1:
5259 case AMDGPU::SI_INDIRECT_SRC_V2:
5260 case AMDGPU::SI_INDIRECT_SRC_V4:
5261 case AMDGPU::SI_INDIRECT_SRC_V8:
5262 case AMDGPU::SI_INDIRECT_SRC_V9:
5263 case AMDGPU::SI_INDIRECT_SRC_V10:
5264 case AMDGPU::SI_INDIRECT_SRC_V11:
5265 case AMDGPU::SI_INDIRECT_SRC_V12:
5266 case AMDGPU::SI_INDIRECT_SRC_V16:
5267 case AMDGPU::SI_INDIRECT_SRC_V32:
5268 return emitIndirectSrc(MI, *BB, *getSubtarget());
5269 case AMDGPU::SI_INDIRECT_DST_V1:
5270 case AMDGPU::SI_INDIRECT_DST_V2:
5271 case AMDGPU::SI_INDIRECT_DST_V4:
5272 case AMDGPU::SI_INDIRECT_DST_V8:
5273 case AMDGPU::SI_INDIRECT_DST_V9:
5274 case AMDGPU::SI_INDIRECT_DST_V10:
5275 case AMDGPU::SI_INDIRECT_DST_V11:
5276 case AMDGPU::SI_INDIRECT_DST_V12:
5277 case AMDGPU::SI_INDIRECT_DST_V16:
5278 case AMDGPU::SI_INDIRECT_DST_V32:
5279 return emitIndirectDst(MI, *BB, *getSubtarget());
5280 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
5281 case AMDGPU::SI_KILL_I1_PSEUDO:
5282 return splitKillBlock(MI, BB);
5283 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
5285 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5286 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5287
5288 Register Dst = MI.getOperand(0).getReg();
5289 const MachineOperand &Src0 = MI.getOperand(1);
5290 const MachineOperand &Src1 = MI.getOperand(2);
5291 const DebugLoc &DL = MI.getDebugLoc();
5292 Register SrcCond = MI.getOperand(3).getReg();
5293
5294 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5295 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5296 const auto *CondRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
5297 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
5298
5299 const TargetRegisterClass *Src0RC = Src0.isReg()
5300 ? MRI.getRegClass(Src0.getReg())
5301 : &AMDGPU::VReg_64RegClass;
5302 const TargetRegisterClass *Src1RC = Src1.isReg()
5303 ? MRI.getRegClass(Src1.getReg())
5304 : &AMDGPU::VReg_64RegClass;
5305
5306 const TargetRegisterClass *Src0SubRC =
5307 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5308 const TargetRegisterClass *Src1SubRC =
5309 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5310
5311 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5312 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5313 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5314 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5315
5316 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5317 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5318 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5319 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5320
5321 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
5322 .addReg(SrcCond);
5323 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
5324 .addImm(0)
5325 .add(Src0Sub0)
5326 .addImm(0)
5327 .add(Src1Sub0)
5328 .addReg(SrcCondCopy);
5329 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
5330 .addImm(0)
5331 .add(Src0Sub1)
5332 .addImm(0)
5333 .add(Src1Sub1)
5334 .addReg(SrcCondCopy);
5335
5336 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
5337 .addReg(DstLo)
5338 .addImm(AMDGPU::sub0)
5339 .addReg(DstHi)
5340 .addImm(AMDGPU::sub1);
5341 MI.eraseFromParent();
5342 return BB;
5343 }
5344 case AMDGPU::SI_BR_UNDEF: {
5346 const DebugLoc &DL = MI.getDebugLoc();
5347 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5348 .add(MI.getOperand(0));
5349 Br->getOperand(1).setIsUndef(); // read undef SCC
5350 MI.eraseFromParent();
5351 return BB;
5352 }
5353 case AMDGPU::ADJCALLSTACKUP:
5354 case AMDGPU::ADJCALLSTACKDOWN: {
5356 MachineInstrBuilder MIB(*MF, &MI);
5357 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
5358 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
5359 return BB;
5360 }
5361 case AMDGPU::SI_CALL_ISEL: {
5363 const DebugLoc &DL = MI.getDebugLoc();
5364
5365 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
5366
5368 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
5369
5370 for (const MachineOperand &MO : MI.operands())
5371 MIB.add(MO);
5372
5373 MIB.cloneMemRefs(MI);
5374 MI.eraseFromParent();
5375 return BB;
5376 }
5377 case AMDGPU::V_ADD_CO_U32_e32:
5378 case AMDGPU::V_SUB_CO_U32_e32:
5379 case AMDGPU::V_SUBREV_CO_U32_e32: {
5380 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
5381 const DebugLoc &DL = MI.getDebugLoc();
5382 unsigned Opc = MI.getOpcode();
5383
5384 bool NeedClampOperand = false;
5385 if (TII->pseudoToMCOpcode(Opc) == -1) {
5386 Opc = AMDGPU::getVOPe64(Opc);
5387 NeedClampOperand = true;
5388 }
5389
5390 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
5391 if (TII->isVOP3(*I)) {
5392 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5393 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5394 I.addReg(TRI->getVCC(), RegState::Define);
5395 }
5396 I.add(MI.getOperand(1))
5397 .add(MI.getOperand(2));
5398 if (NeedClampOperand)
5399 I.addImm(0); // clamp bit for e64 encoding
5400
5401 TII->legalizeOperands(*I);
5402
5403 MI.eraseFromParent();
5404 return BB;
5405 }
5406 case AMDGPU::V_ADDC_U32_e32:
5407 case AMDGPU::V_SUBB_U32_e32:
5408 case AMDGPU::V_SUBBREV_U32_e32:
5409 // These instructions have an implicit use of vcc which counts towards the
5410 // constant bus limit.
5411 TII->legalizeOperands(MI);
5412 return BB;
5413 case AMDGPU::DS_GWS_INIT:
5414 case AMDGPU::DS_GWS_SEMA_BR:
5415 case AMDGPU::DS_GWS_BARRIER:
5416 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
5417 [[fallthrough]];
5418 case AMDGPU::DS_GWS_SEMA_V:
5419 case AMDGPU::DS_GWS_SEMA_P:
5420 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
5421 // An s_waitcnt 0 is required to be the instruction immediately following.
5422 if (getSubtarget()->hasGWSAutoReplay()) {
5424 return BB;
5425 }
5426
5427 return emitGWSMemViolTestLoop(MI, BB);
5428 case AMDGPU::S_SETREG_B32: {
5429 // Try to optimize cases that only set the denormal mode or rounding mode.
5430 //
5431 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
5432 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
5433 // instead.
5434 //
5435 // FIXME: This could be predicated on the immediate, but tablegen doesn't
5436 // allow you to have a no-side-effect instruction in the output of a
5437 // side-effecting pattern.
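 // For example, a write of the constant 0x72 that covers both fields
 // (offset 0, width 8) is split below into s_round_mode 0x2 (0x72 & 0xf) and
 // s_denorm_mode 0x7 (0x72 >> 4).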
5438 auto [ID, Offset, Width] =
5439 AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
5440 if (ID != AMDGPU::Hwreg::ID_MODE)
5441 return BB;
5442
5443 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
5444 const unsigned SetMask = WidthMask << Offset;
5445
5446 if (getSubtarget()->hasDenormModeInst()) {
5447 unsigned SetDenormOp = 0;
5448 unsigned SetRoundOp = 0;
5449
5450 // The dedicated instructions can only set the whole denorm or round mode
5451 // at once, not a subset of bits in either.
5452 if (SetMask ==
5453 (AMDGPU::Hwreg::FP_ROUND_MASK | AMDGPU::Hwreg::FP_DENORM_MASK)) {
5454 // If this fully sets both the round and denorm mode, emit the two
5455 // dedicated instructions for these.
5456 SetRoundOp = AMDGPU::S_ROUND_MODE;
5457 SetDenormOp = AMDGPU::S_DENORM_MODE;
5458 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
5459 SetRoundOp = AMDGPU::S_ROUND_MODE;
5460 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
5461 SetDenormOp = AMDGPU::S_DENORM_MODE;
5462 }
5463
5464 if (SetRoundOp || SetDenormOp) {
5466 MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
5467 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
5468 unsigned ImmVal = Def->getOperand(1).getImm();
5469 if (SetRoundOp) {
5470 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
5471 .addImm(ImmVal & 0xf);
5472
5473 // If we also have the denorm mode, get just the denorm mode bits.
5474 ImmVal >>= 4;
5475 }
5476
5477 if (SetDenormOp) {
5478 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
5479 .addImm(ImmVal & 0xf);
5480 }
5481
5482 MI.eraseFromParent();
5483 return BB;
5484 }
5485 }
5486 }
5487
5488 // If only FP bits are touched, use the no-side-effects pseudo.
5489 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
5490 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
5491 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
5492
5493 return BB;
5494 }
5495 case AMDGPU::S_INVERSE_BALLOT_U32:
5496 case AMDGPU::S_INVERSE_BALLOT_U64:
5497 // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if
5498 // necessary. After that they are equivalent to a COPY.
5499 MI.setDesc(TII->get(AMDGPU::COPY));
5500 return BB;
5501 case AMDGPU::ENDPGM_TRAP: {
5502 const DebugLoc &DL = MI.getDebugLoc();
5503 if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
5504 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
5505 MI.addOperand(MachineOperand::CreateImm(0));
5506 return BB;
5507 }
5508
5509 // We need a block split to make the real endpgm a terminator. We also don't
5510 // want to break phis in successor blocks, so we can't just delete to the
5511 // end of the block.
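 // The resulting control flow is roughly:
 //   BB:      ...  s_cbranch_execnz TrapBB   ; falls through to SplitBB
 //   SplitBB: the instructions that followed the pseudo
 //   TrapBB:  s_endpgm 0                     ; appended at the end of the function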
5512
5513 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
5515 MF->push_back(TrapBB);
5516 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
5517 .addImm(0);
5518 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
5519 .addMBB(TrapBB);
5520
5521 BB->addSuccessor(TrapBB);
5522 MI.eraseFromParent();
5523 return SplitBB;
5524 }
5525 case AMDGPU::SIMULATED_TRAP: {
5526 assert(Subtarget->hasPrivEnabledTrap2NopBug());
5528 MachineBasicBlock *SplitBB =
5529 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
5530 MI.eraseFromParent();
5531 return SplitBB;
5532 }
5533 default:
5534 if (TII->isImage(MI) || TII->isMUBUF(MI)) {
5535 if (!MI.mayStore())
5537 return BB;
5538 }
5540 }
5541}
5542
5544 // This currently forces unfolding various combinations of fsub into fma with
5545 // free fneg'd operands. As long as we have fast FMA (controlled by
5546 // isFMAFasterThanFMulAndFAdd), we should perform these.
5547
5548 // When fma is quarter rate, for f64 where add / sub are at best half rate,
5549 // most of these combines appear to be cycle neutral but save on instruction
5550 // count / code size.
5551 return true;
5552}
5553
5555
5557 EVT VT) const {
5558 if (!VT.isVector()) {
5559 return MVT::i1;
5560 }
5561 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
5562}
5563
5565 // TODO: Should i16 always be used if legal? For now it would force VALU
5566 // shifts.
5567 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
5568}
5569
5571 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
5572 ? Ty.changeElementSize(16)
5573 : Ty.changeElementSize(32);
5574}
5575
5576// Answering this is somewhat tricky and depends on the specific device, since
5577// devices have different rates for fma and for all f64 operations.
5578//
5579// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
5580// regardless of which device (although the number of cycles differs between
5581// devices), so it is always profitable for f64.
5582//
5583// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
5584// only on full rate devices. Normally, we should prefer selecting v_mad_f32
5585// which we can always do even without fused FP ops since it returns the same
5586// result as the separate operations and since it is always full
5587// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
5588// however does not support denormals, so we do report fma as faster if we have
5589// a fast fma device and require denormals.
5590//
5592 EVT VT) const {
5593 VT = VT.getScalarType();
5594
5595 switch (VT.getSimpleVT().SimpleTy) {
5596 case MVT::f32: {
5597 // If mad is not available this depends only on if f32 fma is full rate.
5598 if (!Subtarget->hasMadMacF32Insts())
5599 return Subtarget->hasFastFMAF32();
5600
5601 // Otherwise f32 mad is always full rate and returns the same result as
5602 // the separate operations, so it should be preferred over fma.
5603 // However, mad does not support denormals.
5604 if (!denormalModeIsFlushAllF32(MF))
5605 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
5606
5607 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
5608 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
5609 }
5610 case MVT::f64:
5611 return true;
5612 case MVT::f16:
5613 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
5614 default:
5615 break;
5616 }
5617
5618 return false;
5619}
5620
5622 LLT Ty) const {
5623 switch (Ty.getScalarSizeInBits()) {
5624 case 16:
5625 return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
5626 case 32:
5627 return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
5628 case 64:
5629 return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);
5630 default:
5631 break;
5632 }
5633
5634 return false;
5635}
5636
5638 if (!Ty.isScalar())
5639 return false;
5640
5641 if (Ty.getScalarSizeInBits() == 16)
5642 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
5643 if (Ty.getScalarSizeInBits() == 32)
5644 return Subtarget->hasMadMacF32Insts() &&
5645 denormalModeIsFlushAllF32(*MI.getMF());
5646
5647 return false;
5648}
5649
5651 const SDNode *N) const {
5652 // TODO: Check future ftz flag
5653 // v_mad_f32/v_mac_f32 do not support denormals.
5654 EVT VT = N->getValueType(0);
5655 if (VT == MVT::f32)
5656 return Subtarget->hasMadMacF32Insts() &&
5657 denormalModeIsFlushAllF32(DAG.getMachineFunction());
5658 if (VT == MVT::f16) {
5659 return Subtarget->hasMadF16() &&
5660 denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
5661 }
5662
5663 return false;
5664}
5665
5666//===----------------------------------------------------------------------===//
5667// Custom DAG Lowering Operations
5668//===----------------------------------------------------------------------===//
5669
5670// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
5671// wider vector type is legal.
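// For example, rather than letting a v4f16 operation be scalarized into four
// f16 operations, split it into two v2f16 operations on the low and high
// halves and concatenate the results.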
5673 SelectionDAG &DAG) const {
5674 unsigned Opc = Op.getOpcode();
5675 EVT VT = Op.getValueType();
5676 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
5677 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
5678 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5679 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
5680
5681 SDValue Lo, Hi;
5682 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
5683
5684 SDLoc SL(Op);
5685 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo,
5686 Op->getFlags());
5687 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi,
5688 Op->getFlags());
5689
5690 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5691}
5692
5693// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
5694// wider vector type is legal.
5696 SelectionDAG &DAG) const {
5697 unsigned Opc = Op.getOpcode();
5698 EVT VT = Op.getValueType();
5699 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
5700 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
5701 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5702 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
5703
5704 SDValue Lo0, Hi0;
5705 std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
5706 SDValue Lo1, Hi1;
5707 std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
5708
5709 SDLoc SL(Op);
5710
5711 SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1,
5712 Op->getFlags());
5713 SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1,
5714 Op->getFlags());
5715
5716 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5717}
5718
5720 SelectionDAG &DAG) const {
5721 unsigned Opc = Op.getOpcode();
5722 EVT VT = Op.getValueType();
5723 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
5724 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
5725 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5726 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
5727 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
5728 VT == MVT::v32bf16);
5729
5730 SDValue Lo0, Hi0;
5731 SDValue Op0 = Op.getOperand(0);
5732 std::tie(Lo0, Hi0) = Op0.getValueType().isVector()
5733 ? DAG.SplitVectorOperand(Op.getNode(), 0)
5734 : std::pair(Op0, Op0);
5735 SDValue Lo1, Hi1;
5736 std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
5737 SDValue Lo2, Hi2;
5738 std::tie(Lo2, Hi2) = DAG.SplitVectorOperand(Op.getNode(), 2);
5739
5740 SDLoc SL(Op);
5741 auto ResVT = DAG.GetSplitDestVTs(VT);
5742
5743 SDValue OpLo = DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2,
5744 Op->getFlags());
5745 SDValue OpHi = DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2,
5746 Op->getFlags());
5747
5748 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5749}
5750
5751
5753 switch (Op.getOpcode()) {
5754 default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
5755 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
5756 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
5757 case ISD::LOAD: {
5758 SDValue Result = LowerLOAD(Op, DAG);
5759 assert((!Result.getNode() ||
5760 Result.getNode()->getNumValues() == 2) &&
5761 "Load should return a value and a chain");
5762 return Result;
5763 }
5764 case ISD::FSQRT: {
5765 EVT VT = Op.getValueType();
5766 if (VT == MVT::f32)
5767 return lowerFSQRTF32(Op, DAG);
5768 if (VT == MVT::f64)
5769 return lowerFSQRTF64(Op, DAG);
5770 return SDValue();
5771 }
5772 case ISD::FSIN:
5773 case ISD::FCOS:
5774 return LowerTrig(Op, DAG);
5775 case ISD::SELECT: return LowerSELECT(Op, DAG);
5776 case ISD::FDIV: return LowerFDIV(Op, DAG);
5777 case ISD::FFREXP: return LowerFFREXP(Op, DAG);
5778 case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
5779 case ISD::STORE: return LowerSTORE(Op, DAG);
5780 case ISD::GlobalAddress: {
5783 return LowerGlobalAddress(MFI, Op, DAG);
5784 }
5785 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
5786 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
5787 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
5788 case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
5790 return lowerINSERT_SUBVECTOR(Op, DAG);
5792 return lowerINSERT_VECTOR_ELT(Op, DAG);
5794 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
5796 return lowerVECTOR_SHUFFLE(Op, DAG);
5798 return lowerSCALAR_TO_VECTOR(Op, DAG);
5799 case ISD::BUILD_VECTOR:
5800 return lowerBUILD_VECTOR(Op, DAG);
5801 case ISD::FP_ROUND:
5803 return lowerFP_ROUND(Op, DAG);
5804 case ISD::FPTRUNC_ROUND: {
5805 unsigned Opc;
5806 SDLoc DL(Op);
5807
5808 if (Op.getOperand(0)->getValueType(0) != MVT::f32)
5809 return SDValue();
5810
5811 // Get the rounding mode from the last operand
5812 int RoundMode = Op.getConstantOperandVal(1);
5813 if (RoundMode == (int)RoundingMode::TowardPositive)
5815 else if (RoundMode == (int)RoundingMode::TowardNegative)
5817 else
5818 return SDValue();
5819
5820 return DAG.getNode(Opc, DL, Op.getNode()->getVTList(), Op->getOperand(0));
5821 }
5822 case ISD::TRAP:
5823 return lowerTRAP(Op, DAG);
5824 case ISD::DEBUGTRAP:
5825 return lowerDEBUGTRAP(Op, DAG);
5826 case ISD::ABS:
5827 case ISD::FABS:
5828 case ISD::FNEG:
5829 case ISD::FCANONICALIZE:
5830 case ISD::BSWAP:
5831 return splitUnaryVectorOp(Op, DAG);
5832 case ISD::FMINNUM:
5833 case ISD::FMAXNUM:
5834 return lowerFMINNUM_FMAXNUM(Op, DAG);
5835 case ISD::FLDEXP:
5836 case ISD::STRICT_FLDEXP:
5837 return lowerFLDEXP(Op, DAG);
5838 case ISD::FMA:
5839 return splitTernaryVectorOp(Op, DAG);
5840 case ISD::FP_TO_SINT:
5841 case ISD::FP_TO_UINT:
5842 return LowerFP_TO_INT(Op, DAG);
5843 case ISD::SHL:
5844 case ISD::SRA:
5845 case ISD::SRL:
5846 case ISD::ADD:
5847 case ISD::SUB:
5848 case ISD::SMIN:
5849 case ISD::SMAX:
5850 case ISD::UMIN:
5851 case ISD::UMAX:
5852 case ISD::FADD:
5853 case ISD::FMUL:
5854 case ISD::FMINNUM_IEEE:
5855 case ISD::FMAXNUM_IEEE:
5856 case ISD::FMINIMUM:
5857 case ISD::FMAXIMUM:
5858 case ISD::UADDSAT:
5859 case ISD::USUBSAT:
5860 case ISD::SADDSAT:
5861 case ISD::SSUBSAT:
5862 return splitBinaryVectorOp(Op, DAG);
5863 case ISD::MUL:
5864 return lowerMUL(Op, DAG);
5865 case ISD::SMULO:
5866 case ISD::UMULO:
5867 return lowerXMULO(Op, DAG);
5868 case ISD::SMUL_LOHI:
5869 case ISD::UMUL_LOHI:
5870 return lowerXMUL_LOHI(Op, DAG);
5871 case ISD::DYNAMIC_STACKALLOC:
5872 return LowerDYNAMIC_STACKALLOC(Op, DAG);
5873 case ISD::STACKSAVE:
5874 return LowerSTACKSAVE(Op, DAG);
5875 case ISD::GET_ROUNDING:
5876 return lowerGET_ROUNDING(Op, DAG);
5877 case ISD::SET_ROUNDING:
5878 return lowerSET_ROUNDING(Op, DAG);
5879 case ISD::PREFETCH:
5880 return lowerPREFETCH(Op, DAG);
5881 case ISD::FP_EXTEND:
5883 return lowerFP_EXTEND(Op, DAG);
5884 case ISD::GET_FPENV:
5885 return lowerGET_FPENV(Op, DAG);
5886 case ISD::SET_FPENV:
5887 return lowerSET_FPENV(Op, DAG);
5888 }
5889 return SDValue();
5890}
5891
5892 // Used for D16: Casts the result of an instruction into the right vector, and
5893 // packs values if loads return unpacked values.
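// For example, with unpacked D16 a v3f16 load comes back as v3i32 (one 16-bit
// value per 32-bit register); each element is truncated to i16, padded with an
// undef lane to v4i16, and the result is bitcast to the widened v4f16 type.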
5895 const SDLoc &DL,
5896 SelectionDAG &DAG, bool Unpacked) {
5897 if (!LoadVT.isVector())
5898 return Result;
5899
5900 // Cast back to the original packed type, or to a larger type that is a
5901 // multiple of 32 bits for D16. Widening the return type is required for
5902 // legalization.
5903 EVT FittingLoadVT = LoadVT;
5904 if ((LoadVT.getVectorNumElements() % 2) == 1) {
5905 FittingLoadVT =
5907 LoadVT.getVectorNumElements() + 1);
5908 }
5909
5910 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
5911 // Truncate to v2i16/v4i16.
5912 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
5913
5914 // Work around the legalizer not scalarizing the truncate after vector op
5915 // legalization and not creating an intermediate vector trunc.
5917 DAG.ExtractVectorElements(Result, Elts);
5918 for (SDValue &Elt : Elts)
5919 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
5920
5921 // Pad illegal v1i16/v3f16 to v4i16
5922 if ((LoadVT.getVectorNumElements() % 2) == 1)
5923 Elts.push_back(DAG.getUNDEF(MVT::i16));
5924
5925 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
5926
5927 // Bitcast to original type (v2f16/v4f16).
5928 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
5929 }
5930
5931 // Cast back to the original packed type.
5932 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
5933}
5934
5935SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
5936 MemSDNode *M,
5937 SelectionDAG &DAG,
5939 bool IsIntrinsic) const {
5940 SDLoc DL(M);
5941
5942 bool Unpacked = Subtarget->hasUnpackedD16VMem();
5943 EVT LoadVT = M->getValueType(0);
5944
5945 EVT EquivLoadVT = LoadVT;
5946 if (LoadVT.isVector()) {
5947 if (Unpacked) {
5948 EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
5949 LoadVT.getVectorNumElements());
5950 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
5951 // Widen v3f16 to legal type
5952 EquivLoadVT =
5954 LoadVT.getVectorNumElements() + 1);
5955 }
5956 }
5957
5958 // Change from v4f16/v2f16 to EquivLoadVT.
5959 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
5960
5962 = DAG.getMemIntrinsicNode(
5963 IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL,
5964 VTList, Ops, M->getMemoryVT(),
5965 M->getMemOperand());
5966
5967 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
5968
5969 return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL);
5970}
5971
5972SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
5973 SelectionDAG &DAG,
5974 ArrayRef<SDValue> Ops) const {
5975 SDLoc DL(M);
5976 EVT LoadVT = M->getValueType(0);
5977 EVT EltType = LoadVT.getScalarType();
5978 EVT IntVT = LoadVT.changeTypeToInteger();
5979
5980 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
5981
5982 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
5983 bool IsTFE = M->getNumValues() == 3;
5984
5985 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
5987 : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
5988 : AMDGPUISD::BUFFER_LOAD;
5989
5990 if (IsD16) {
5991 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
5992 }
5993
5994 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
5995 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
5996 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
5997 IsTFE);
5998
5999 if (isTypeLegal(LoadVT)) {
6000 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
6001 M->getMemOperand(), DAG);
6002 }
6003
6004 EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
6005 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
6006 SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
6007 M->getMemOperand(), DAG);
6008 return DAG.getMergeValues(
6009 {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
6010 DL);
6011}
6012
6014 SDNode *N, SelectionDAG &DAG) {
6015 EVT VT = N->getValueType(0);
6016 unsigned CondCode = N->getConstantOperandVal(3);
6017 if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
6018 return DAG.getUNDEF(VT);
6019
6020 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
6021
6022 SDValue LHS = N->getOperand(1);
6023 SDValue RHS = N->getOperand(2);
6024
6025 SDLoc DL(N);
6026
6027 EVT CmpVT = LHS.getValueType();
6028 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
6029 unsigned PromoteOp = ICmpInst::isSigned(IcInput) ?
6031 LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
6032 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
6033 }
6034
6035 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
6036
6037 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6038 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
6039
6040 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
6041 DAG.getCondCode(CCOpcode));
6042 if (VT.bitsEq(CCVT))
6043 return SetCC;
6044 return DAG.getZExtOrTrunc(SetCC, DL, VT);
6045}
6046
6048 SDNode *N, SelectionDAG &DAG) {
6049 EVT VT = N->getValueType(0);
6050
6051 unsigned CondCode = N->getConstantOperandVal(3);
6052 if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
6053 return DAG.getUNDEF(VT);
6054
6055 SDValue Src0 = N->getOperand(1);
6056 SDValue Src1 = N->getOperand(2);
6057 EVT CmpVT = Src0.getValueType();
6058 SDLoc SL(N);
6059
6060 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
6061 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
6062 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
6063 }
6064
6065 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
6066 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
6067 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6068 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
6069 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0,
6070 Src1, DAG.getCondCode(CCOpcode));
6071 if (VT.bitsEq(CCVT))
6072 return SetCC;
6073 return DAG.getZExtOrTrunc(SetCC, SL, VT);
6074}
6075
6077 SelectionDAG &DAG) {
6078 EVT VT = N->getValueType(0);
6079 SDValue Src = N->getOperand(1);
6080 SDLoc SL(N);
6081
6082 if (Src.getOpcode() == ISD::SETCC) {
6083 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
6084 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0),
6085 Src.getOperand(1), Src.getOperand(2));
6086 }
6087 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
6088 // (ballot 0) -> 0
6089 if (Arg->isZero())
6090 return DAG.getConstant(0, SL, VT);
6091
6092 // (ballot 1) -> EXEC/EXEC_LO
6093 if (Arg->isOne()) {
6094 Register Exec;
6095 if (VT.getScalarSizeInBits() == 32)
6096 Exec = AMDGPU::EXEC_LO;
6097 else if (VT.getScalarSizeInBits() == 64)
6098 Exec = AMDGPU::EXEC;
6099 else
6100 return SDValue();
6101
6102 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
6103 }
6104 }
6105
6106 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
6107 // ISD::SETNE)
6108 return DAG.getNode(
6109 AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
6110 DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
6111}
6112
6114 SelectionDAG &DAG) {
6115 EVT VT = N->getValueType(0);
6116 unsigned ValSize = VT.getSizeInBits();
6117 unsigned IID = N->getConstantOperandVal(0);
6118 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
6119 IID == Intrinsic::amdgcn_permlanex16;
6120 SDLoc SL(N);
6121 MVT IntVT = MVT::getIntegerVT(ValSize);
6122
6123 auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
6124 SDValue Src2, MVT ValT) -> SDValue {
6126 switch (IID) {
6127 case Intrinsic::amdgcn_permlane16:
6128 case Intrinsic::amdgcn_permlanex16:
6129 Operands.push_back(N->getOperand(6));
6130 Operands.push_back(N->getOperand(5));
6131 Operands.push_back(N->getOperand(4));
6132 [[fallthrough]];
6133 case Intrinsic::amdgcn_writelane:
6134 Operands.push_back(Src2);
6135 [[fallthrough]];
6136 case Intrinsic::amdgcn_readlane:
6137 Operands.push_back(Src1);
6138 [[fallthrough]];
6139 case Intrinsic::amdgcn_readfirstlane:
6140 case Intrinsic::amdgcn_permlane64:
6141 Operands.push_back(Src0);
6142 break;
6143 default:
6144 llvm_unreachable("unhandled lane op");
6145 }
6146
6147 Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
6148 std::reverse(Operands.begin(), Operands.end());
6149
6150 if (SDNode *GL = N->getGluedNode()) {
6151 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
6152 GL = GL->getOperand(0).getNode();
6153 Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
6154 SDValue(GL, 0)));
6155 }
6156
6157 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands);
6158 };
6159
6160 SDValue Src0 = N->getOperand(1);
6161 SDValue Src1, Src2;
6162 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
6163 IsPermLane16) {
6164 Src1 = N->getOperand(2);
6165 if (IID == Intrinsic::amdgcn_writelane || IsPermLane16)
6166 Src2 = N->getOperand(3);
6167 }
6168
6169 if (ValSize == 32) {
6170 // Already legal
6171 return SDValue();
6172 }
6173
6174 if (ValSize < 32) {
6175 bool IsFloat = VT.isFloatingPoint();
6176 Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
6177 SL, MVT::i32);
6178
6179 if (IsPermLane16) {
6180 Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
6181 SL, MVT::i32);
6182 }
6183
6184 if (IID == Intrinsic::amdgcn_writelane) {
6185 Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
6186 SL, MVT::i32);
6187 }
6188
6189 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
6190 SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
6191 return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
6192 }
6193
6194 if (ValSize % 32 != 0)
6195 return SDValue();
6196
6197 auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
6198 EVT VT = N->getValueType(0);
6199 unsigned NE = VT.getVectorNumElements();
6200 EVT EltVT = VT.getVectorElementType();
6202 unsigned NumOperands = N->getNumOperands();
6203 SmallVector<SDValue, 4> Operands(NumOperands);
6204 SDNode *GL = N->getGluedNode();
6205
6206 // only handle convergencectrl_glue
6208
6209 for (unsigned i = 0; i != NE; ++i) {
6210 for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
6211 ++j) {
6212 SDValue Operand = N->getOperand(j);
6213 EVT OperandVT = Operand.getValueType();
6214 if (OperandVT.isVector()) {
6215 // A vector operand; extract a single element.
6216 EVT OperandEltVT = OperandVT.getVectorElementType();
6217 Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT,
6218 Operand, DAG.getVectorIdxConstant(i, SL));
6219 } else {
6220 // A scalar operand; just use it as is.
6221 Operands[j] = Operand;
6222 }
6223 }
6224
6225 if (GL)
6226 Operands[NumOperands - 1] =
6227 DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
6228 SDValue(GL->getOperand(0).getNode(), 0));
6229
6230 Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));
6231 }
6232
6233 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE);
6234 return DAG.getBuildVector(VecVT, SL, Scalars);
6235 };
6236
6237 if (VT.isVector()) {
6238 switch (MVT::SimpleValueType EltTy =
6240 case MVT::i32:
6241 case MVT::f32: {
6242 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
6243 return unrollLaneOp(LaneOp.getNode());
6244 }
6245 case MVT::i16:
6246 case MVT::f16:
6247 case MVT::bf16: {
6248 MVT SubVecVT = MVT::getVectorVT(EltTy, 2);
6250 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
6251 for (unsigned i = 0, EltIdx = 0; i < ValSize / 32; i++) {
6252 Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
6253 DAG.getConstant(EltIdx, SL, MVT::i32));
6254
6255 if (IsPermLane16)
6256 Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
6257 DAG.getConstant(EltIdx, SL, MVT::i32));
6258
6259 if (IID == Intrinsic::amdgcn_writelane)
6260 Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
6261 DAG.getConstant(EltIdx, SL, MVT::i32));
6262
6263 Pieces.push_back(
6264 IsPermLane16
6265 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
6266 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
6267 EltIdx += 2;
6268 }
6269 return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
6270 }
6271 default:
6272 // Handle all other cases by bitcasting to i32 vectors
6273 break;
6274 }
6275 }
6276
6277 MVT VecVT = MVT::getVectorVT(MVT::i32, ValSize / 32);
6278 Src0 = DAG.getBitcast(VecVT, Src0);
6279
6280 if (IsPermLane16)
6281 Src1 = DAG.getBitcast(VecVT, Src1);
6282
6283 if (IID == Intrinsic::amdgcn_writelane)
6284 Src2 = DAG.getBitcast(VecVT, Src2);
6285
6286 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
6287 SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
6288 return DAG.getBitcast(VT, UnrolledLaneOp);
6289}
6290
6293 SelectionDAG &DAG) const {
6294 switch (N->getOpcode()) {
6296 if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
6297 Results.push_back(Res);
6298 return;
6299 }
6301 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
6302 Results.push_back(Res);
6303 return;
6304 }
6306 unsigned IID = N->getConstantOperandVal(0);
6307 switch (IID) {
6308 case Intrinsic::amdgcn_make_buffer_rsrc:
6309 Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
6310 return;
6311 case Intrinsic::amdgcn_cvt_pkrtz: {
6312 SDValue Src0 = N->getOperand(1);
6313 SDValue Src1 = N->getOperand(2);
6314 SDLoc SL(N);
6315 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32,
6316 Src0, Src1);
6317 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
6318 return;
6319 }
6320 case Intrinsic::amdgcn_cvt_pknorm_i16:
6321 case Intrinsic::amdgcn_cvt_pknorm_u16:
6322 case Intrinsic::amdgcn_cvt_pk_i16:
6323 case Intrinsic::amdgcn_cvt_pk_u16: {
6324 SDValue Src0 = N->getOperand(1);
6325 SDValue Src1 = N->getOperand(2);
6326 SDLoc SL(N);
6327 unsigned Opcode;
6328
6329 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
6331 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
6333 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
6335 else
6337
6338 EVT VT = N->getValueType(0);
6339 if (isTypeLegal(VT))
6340 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
6341 else {
6342 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
6343 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
6344 }
6345 return;
6346 }
6347 case Intrinsic::amdgcn_s_buffer_load: {
6348 // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
6349 // s_buffer_load_u8 for signed and unsigned load instructions. Next, the DAG
6350 // combiner tries to merge the s_buffer_load_u8 with a sext instruction
6351 // (performSignExtendInRegCombine()) and it replaces s_buffer_load_u8 with
6352 // s_buffer_load_i8.
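 // For example, for IR along the lines of
 //   %v = call i8 @llvm.amdgcn.s.buffer.load.i8(...)
 //   %s = sext i8 %v to i32
 // the initial selection produces s_buffer_load_u8, and the sign-extension
 // combine then rewrites it to s_buffer_load_i8.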
6353 if (!Subtarget->hasScalarSubwordLoads())
6354 return;
6355 SDValue Op = SDValue(N, 0);
6356 SDValue Rsrc = Op.getOperand(1);
6357 SDValue Offset = Op.getOperand(2);
6358 SDValue CachePolicy = Op.getOperand(3);
6359 EVT VT = Op.getValueType();
6360 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
6361 SDLoc DL(Op);
6363 const DataLayout &DataLayout = DAG.getDataLayout();
6364 Align Alignment =
6370 VT.getStoreSize(), Alignment);
6371 SDValue LoadVal;
6372 if (!Offset->isDivergent()) {
6373 SDValue Ops[] = {Rsrc, // source register
6374 Offset, CachePolicy};
6375 SDValue BufferLoad =
6377 DAG.getVTList(MVT::i32), Ops, VT, MMO);
6378 LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
6379 } else {
6380 SDValue Ops[] = {
6381 DAG.getEntryNode(), // Chain
6382 Rsrc, // rsrc
6383 DAG.getConstant(0, DL, MVT::i32), // vindex
6384 {}, // voffset
6385 {}, // soffset
6386 {}, // offset
6387 CachePolicy, // cachepolicy
6388 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
6389 };
6390 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
6391 LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
6392 }
6393 Results.push_back(LoadVal);
6394 return;
6395 }
6396 }
6397 break;
6398 }
6400 if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
6401 if (Res.getOpcode() == ISD::MERGE_VALUES) {
6402 // FIXME: Hacky
6403 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
6404 Results.push_back(Res.getOperand(I));
6405 }
6406 } else {
6407 Results.push_back(Res);
6408 Results.push_back(Res.getValue(1));
6409 }
6410 return;
6411 }
6412
6413 break;
6414 }
6415 case ISD::SELECT: {
6416 SDLoc SL(N);
6417 EVT VT = N->getValueType(0);
6418 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
6419 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
6420 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
6421
6422 EVT SelectVT = NewVT;
6423 if (NewVT.bitsLT(MVT::i32)) {
6424 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
6425 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
6426 SelectVT = MVT::i32;
6427 }
6428
6429 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, SelectVT,
6430 N->getOperand(0), LHS, RHS);
6431
6432 if (NewVT != SelectVT)
6433 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
6434 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
6435 return;
6436 }
6437 case ISD::FNEG: {
6438 if (N->getValueType(0) != MVT::v2f16)
6439 break;
6440
6441 SDLoc SL(N);
6442 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
6443
6444 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32,
6445 BC,
6446 DAG.getConstant(0x80008000, SL, MVT::i32));
6447 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
6448 return;
6449 }
6450 case ISD::FABS: {
6451 if (N->getValueType(0) != MVT::v2f16)
6452 break;
6453
6454 SDLoc SL(N);
6455 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
6456
6457 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32,
6458 BC,
6459 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
6460 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
6461 return;
6462 }
6463 case ISD::FSQRT: {
6464 if (N->getValueType(0) != MVT::f16)
6465 break;
6466 Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
6467 break;
6468 }
6469 default:
6471 break;
6472 }
6473}
6474
6475/// Helper function for LowerBRCOND
6476static SDNode *findUser(SDValue Value, unsigned Opcode) {
6477
6478 SDNode *Parent = Value.getNode();
6479 for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
6480 I != E; ++I) {
6481
6482 if (I.getUse().get() != Value)
6483 continue;
6484
6485 if (I->getOpcode() == Opcode)
6486 return *I;
6487 }
6488 return nullptr;
6489}
6490
6491unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
6492 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
6493 switch (Intr->getConstantOperandVal(1)) {
6494 case Intrinsic::amdgcn_if:
6495 return AMDGPUISD::IF;
6496 case Intrinsic::amdgcn_else:
6497 return AMDGPUISD::ELSE;
6498 case Intrinsic::amdgcn_loop:
6499 return AMDGPUISD::LOOP;
6500 case Intrinsic::amdgcn_end_cf:
6501 llvm_unreachable("should not occur");
6502 default:
6503 return 0;
6504 }
6505 }
6506
6507 // break, if_break, else_break are all only used as inputs to loop, not
6508 // directly as branch conditions.
6509 return 0;
6510}
6511
6518
6520 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
6521 return false;
6522
6523 // FIXME: Either avoid relying on address space here or change the default
6524 // address space for functions to avoid the explicit check.
6525 return (GV->getValueType()->isFunctionTy() ||
6528}
6529
6531 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
6532}
6533
6535 if (!GV->hasExternalLinkage())
6536 return true;
6537
6538 const auto OS = getTargetMachine().getTargetTriple().getOS();
6539 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
6540}
6541
6542 /// This transforms the control flow intrinsics to get the branch destination
6543 /// as the last parameter, and also switches the branch target with BR if the need arises.
6544SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
6545 SelectionDAG &DAG) const {
6546 SDLoc DL(BRCOND);
6547
6548 SDNode *Intr = BRCOND.getOperand(1).getNode();
6549 SDValue Target = BRCOND.getOperand(2);
6550 SDNode *BR = nullptr;
6551 SDNode *SetCC = nullptr;
6552
6553 if (Intr->getOpcode() == ISD::SETCC) {
6554 // As long as we negate the condition everything is fine
6555 SetCC = Intr;
6556 Intr = SetCC->getOperand(0).getNode();
6557
6558 } else {
6559 // Get the target from BR if we don't negate the condition
6560 BR = findUser(BRCOND, ISD::BR);
6561 assert(BR && "brcond missing unconditional branch user");
6562 Target = BR->getOperand(1);
6563 }
6564
6565 unsigned CFNode = isCFIntrinsic(Intr);
6566 if (CFNode == 0) {
6567 // This is a uniform branch so we don't need to legalize.
6568 return BRCOND;
6569 }
6570
6571 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
6572 Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
6573
6574 assert(!SetCC ||
6575 (SetCC->getConstantOperandVal(1) == 1 &&
6576 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
6577 ISD::SETNE));
6578
6579 // operands of the new intrinsic call
6581 if (HaveChain)
6582 Ops.push_back(BRCOND.getOperand(0));
6583
6584 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
6585 Ops.push_back(Target);
6586
6587 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
6588
6589 // build the new intrinsic call
6590 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
6591
6592 if (!HaveChain) {
6593 SDValue Ops[] = {
6594 SDValue(Result, 0),
6595 BRCOND.getOperand(0)
6596 };
6597
6598 Result = DAG.getMergeValues(Ops, DL).getNode();
6599 }
6600
6601 if (BR) {
6602 // Give the branch instruction our target
6603 SDValue Ops[] = {
6604 BR->getOperand(0),
6605 BRCOND.getOperand(2)
6606 };
6607 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
6608 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
6609 }
6610
6611 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
6612
6613 // Copy the intrinsic results to registers
6614 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
6616 if (!CopyToReg)
6617 continue;
6618
6619 Chain = DAG.getCopyToReg(
6620 Chain, DL,
6621 CopyToReg->getOperand(1),
6622 SDValue(Result, i - 1),
6623 SDValue());
6624
6625 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
6626 }
6627
6628 // Remove the old intrinsic from the chain
6630 SDValue(Intr, Intr->getNumValues() - 1),
6631 Intr->getOperand(0));
6632
6633 return Chain;
6634}
6635
6636SDValue SITargetLowering::LowerRETURNADDR(SDValue Op,
6637 SelectionDAG &DAG) const {
6638 MVT VT = Op.getSimpleValueType();
6639 SDLoc DL(Op);
6640 // Checking the depth
6641 if (Op.getConstantOperandVal(0) != 0)
6642 return DAG.getConstant(0, DL, VT);
6643
6646 // Check for kernel and shader functions
6647 if (Info->isEntryFunction())
6648 return DAG.getConstant(0, DL, VT);
6649
6650 MachineFrameInfo &MFI = MF.getFrameInfo();
6651 // There is a call to @llvm.returnaddress in this function
6652 MFI.setReturnAddressIsTaken(true);
6653
6655 // Get the return address reg and mark it as an implicit live-in
6656 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF), getRegClassFor(VT, Op.getNode()->isDivergent()));
6657
6658 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
6659}
6660
6661SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG,
6662 SDValue Op,
6663 const SDLoc &DL,
6664 EVT VT) const {
6665 return Op.getValueType().bitsLE(VT) ?
6666 DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) :
6667 DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
6668 DAG.getTargetConstant(0, DL, MVT::i32));
6669}
6670
6671SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
6672 assert(Op.getValueType() == MVT::f16 &&
6673 "Do not know how to custom lower FP_ROUND for non-f16 type");
6674
6675 SDValue Src = Op.getOperand(0);
6676 EVT SrcVT = Src.getValueType();
6677 if (SrcVT != MVT::f64)
6678 return Op;
6679
6680 // TODO: Handle strictfp
6681 if (Op.getOpcode() != ISD::FP_ROUND)
6682 return Op;
6683
6684 SDLoc DL(Op);
6685
6686 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
6687 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
6688 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
6689}
6690
6691SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
6692 SelectionDAG &DAG) const {
6693 EVT VT = Op.getValueType();
6694 const MachineFunction &MF = DAG.getMachineFunction();
6696 bool IsIEEEMode = Info->getMode().IEEE;
6697
6698 // FIXME: Assert during selection that this is only selected for
6699 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
6700 // mode functions, but this happens to be OK since it's only done in cases
6701 // where it is known that there is no sNaN.
6702 if (IsIEEEMode)
6703 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
6704
6705 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
6706 VT == MVT::v16bf16)
6707 return splitBinaryVectorOp(Op, DAG);
6708 return Op;
6709}
6710
6711SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
6712 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
6713 EVT VT = Op.getValueType();
6714 assert(VT == MVT::f16);
6715
6716 SDValue Exp = Op.getOperand(IsStrict ? 2 : 1);
6717 EVT ExpVT = Exp.getValueType();
6718 if (ExpVT == MVT::i16)
6719 return Op;
6720
6721 SDLoc DL(Op);
6722
6723 // Correct the exponent type for f16 to i16.
6724 // Clamp the range of the exponent to the instruction's range.
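 // (The clamp is lossless: e.g. an i32 exponent of 100000 becomes 32767, and
 // ldexp of any nonzero finite f16 value by 2^32767 already overflows to
 // infinity exactly as 2^100000 would, while 0, inf and nan are unaffected.)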
6725
6726 // TODO: This should be a generic narrowing legalization, and can easily be
6727 // done for GlobalISel.
6728
6729 SDValue MinExp = DAG.getConstant(minIntN(16), DL, ExpVT);
6730 SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp);
6731
6732 SDValue MaxExp = DAG.getConstant(maxIntN(16), DL, ExpVT);
6733 SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp);
6734
6735 SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp);
6736
6737 if (IsStrict) {
6738 return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other},
6739 {Op.getOperand(0), Op.getOperand(1), TruncExp});
6740 }
6741
6742 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
6743}
6744
6745// Custom lowering for vector multiplications and s_mul_u64.
6746SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
6747 EVT VT = Op.getValueType();
6748
6749 // Split vector operands.
6750 if (VT.isVector())
6751 return splitBinaryVectorOp(Op, DAG);
6752
6753 assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
6754
6755 // There are four ways to lower s_mul_u64:
6756 //
6757 // 1. If all the operands are uniform, then we lower it as it is.
6758 //
6759 // 2. If the operands are divergent, then we have to split s_mul_u64 into 32-bit
6760 // multiplications because there is no vector equivalent of s_mul_u64.
6761 //
6762 // 3. If the cost model decides that it is more efficient to use vector
6763 // registers, then we have to split s_mul_u64 into 32-bit multiplications.
6764 // This happens in splitScalarSMULU64() in SIInstrInfo.cpp .
6765 //
6766 // 4. If the cost model decides to use vector registers and both of the
6767 // operands are zero-extended/sign-extended from 32 bits, then we split the
6768 // s_mul_u64 into two 32-bit multiplications. The problem is that it is not
6769 // possible to check if the operands are zero-extended or sign-extended in
6770 // SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
6771 // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
6772 // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
6773 // If the cost model decides that we have to use vector registers, then
6774 // splitScalarSMulPseudo() (in SIInstrInfo.cpp) split s_mul_u64_u32/
6775 // s_mul_i64_i32_pseudo in two vector multiplications. If the cost model
6776 // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
6777 // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
6778 // SIInstrInfo.cpp .
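 // For reference, the 32-bit split used by cases 2-4 computes, writing
 // a = (a_hi << 32) | a_lo and b = (b_hi << 32) | b_lo:
 //   lo32(a * b) = lo32(a_lo * b_lo)
 //   hi32(a * b) = hi32(a_lo * b_lo) + lo32(a_lo * b_hi) + lo32(a_hi * b_lo)
 // When both operands are zero-extended (or both sign-extended) from 32 bits,
 // the whole product reduces to a single unsigned (or signed) 32 x 32 -> 64 bit
 // multiply of the low halves, which is what case 4 exploits.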
6779
6780 if (Op->isDivergent())
6781 return SDValue();
6782
6783 SDValue Op0 = Op.getOperand(0);
6784 SDValue Op1 = Op.getOperand(1);
6785 // If all the operands are zero-extended from 32 bits, then we replace s_mul_u64
6786 // with s_mul_u64_u32_pseudo. If all the operands are sign-extended from
6787 // 32 bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
6788 KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
6789 unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
6790 KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
6791 unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
6792 SDLoc SL(Op);
6793 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
6794 return SDValue(
6795 DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
6796 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
6797 unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
6798 if (Op0SignBits >= 33 && Op1SignBits >= 33)
6799 return SDValue(
6800 DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
6801 // If all the operands are uniform, then we lower s_mul_u64 as it is.
6802 return Op;
6803}
6804
6805SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
6806 EVT VT = Op.getValueType();
6807 SDLoc SL(Op);
6808 SDValue LHS = Op.getOperand(0);
6809 SDValue RHS = Op.getOperand(1);
6810 bool isSigned = Op.getOpcode() == ISD::SMULO;
6811
6812 if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
6813 const APInt &C = RHSC->getAPIntValue();
6814 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
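 // For example, for a 32-bit umulo(x, 8): Result = x << 3, and
 // ((x << 3) >> 3) != x holds exactly when one of the top three bits of x was
 // set, i.e. exactly when the product overflowed.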
6815 if (C.isPowerOf2()) {
6816 // smulo(x, signed_min) is the same as umulo(x, signed_min).
6817 bool UseArithShift = isSigned && !C.isMinSignedValue();
6818 SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
6819 SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
6820 SDValue Overflow = DAG.getSetCC(SL, MVT::i1,
6821 DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL,
6822 SL, VT, Result, ShiftAmt),
6823 LHS, ISD::SETNE);
6824 return DAG.getMergeValues({ Result, Overflow }, SL);
6825 }
6826 }
6827
6828 SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
6830 SL, VT, LHS, RHS);
6831
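 // There is no overflow iff the high half of the full product equals the sign
 // extension of the low half: all zero bits for an unsigned multiply, or a
 // copy of the result's sign bit for a signed multiply.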
6832 SDValue Sign = isSigned
6833 ? DAG.getNode(ISD::SRA, SL, VT, Result,
6834 DAG.getConstant(VT.getScalarSizeInBits() - 1, SL, MVT::i32))
6835 : DAG.getConstant(0, SL, VT);
6836 SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
6837
6838 return DAG.getMergeValues({ Result, Overflow }, SL);
6839}
6840
6841SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
6842 if (Op->isDivergent()) {
6843 // Select to V_MAD_[IU]64_[IU]32.
6844 return Op;
6845 }
6846 if (Subtarget->hasSMulHi()) {
6847 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
6848 return SDValue();
6849 }
6850 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
6851 // calculate the high part, so we might as well do the whole thing with
6852 // V_MAD_[IU]64_[IU]32.
6853 return Op;
6854}
6855
6856SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
6857 if (!Subtarget->isTrapHandlerEnabled() ||
6858 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
6859 return lowerTrapEndpgm(Op, DAG);
6860
6861 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG) :
6862 lowerTrapHsaQueuePtr(Op, DAG);
6863}
6864
6865SDValue SITargetLowering::lowerTrapEndpgm(
6866 SDValue Op, SelectionDAG &DAG) const {
6867 SDLoc SL(Op);
6868 SDValue Chain = Op.getOperand(0);
6869 return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
6870}
6871
6872SDValue SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
6873 const SDLoc &DL, Align Alignment, ImplicitParameter Param) const {
6874 MachineFunction &MF = DAG.getMachineFunction();
6875 uint64_t Offset = getImplicitParameterOffset(MF, Param);
6876 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
6877 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
6878 return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment,
6879 MachineMemOperand::MODereferenceable |
6880 MachineMemOperand::MOInvariant);
6881}
6882
6883SDValue SITargetLowering::lowerTrapHsaQueuePtr(
6884 SDValue Op, SelectionDAG &DAG) const {
6885 SDLoc SL(Op);
6886 SDValue Chain = Op.getOperand(0);
6887
6888 SDValue QueuePtr;
6889 // For code object version 5, QueuePtr is passed through implicit kernarg.
6890 const Module *M = DAG.getMachineFunction().getFunction().getParent();
6891 if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
6892 QueuePtr =
6893 loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
6894 } else {
6895 MachineFunction &MF = DAG.getMachineFunction();
6896 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
6897 Register UserSGPR = Info->getQueuePtrUserSGPR();
6898
6899 if (UserSGPR == AMDGPU::NoRegister) {
6900 // We probably are in a function incorrectly marked with
6901 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
6902 // trap, so just use a null pointer.
6903 QueuePtr = DAG.getConstant(0, SL, MVT::i64);
6904 } else {
6905 QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
6906 MVT::i64);
6907 }
6908 }
6909
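// The HSA trap handler expects the queue pointer in SGPR0:SGPR1, so copy it
// there and pass the register pair to the trap node.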
6910 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
6911 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
6912 QueuePtr, SDValue());
6913
6914 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
6915 SDValue Ops[] = {
6916 ToReg,
6917 DAG.getTargetConstant(TrapID, SL, MVT::i16),
6918 SGPR01,
6919 ToReg.getValue(1)
6920 };
6921 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
6922}
6923
6924SDValue SITargetLowering::lowerTrapHsa(
6925 SDValue Op, SelectionDAG &DAG) const {
6926 SDLoc SL(Op);
6927 SDValue Chain = Op.getOperand(0);
6928
6929 // We need to simulate the 's_trap 2' instruction on targets that run in
6930 // PRIV=1 (where it is treated as a nop).
6931 if (Subtarget->hasPrivEnabledTrap2NopBug())
6932 return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
6933
6934 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
6935 SDValue Ops[] = {
6936 Chain,
6937 DAG.getTargetConstant(TrapID, SL, MVT::i16)
6938 };
6939 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
6940}
6941
6942SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
6943 SDLoc SL(Op);
6944 SDValue Chain = Op.getOperand(0);
6945 MachineFunction &MF = DAG.getMachineFunction();
6946
6947 if (!Subtarget->isTrapHandlerEnabled() ||
6948 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
6949 DiagnosticInfoUnsupported NoTrap(MF.getFunction(),
6950 "debugtrap handler not supported",
6951 Op.getDebugLoc(),
6952 DS_Warning);
6953 LLVMContext &Ctx = MF.getFunction().getContext();
6954 Ctx.diagnose(NoTrap);
6955 return Chain;
6956 }
6957
6958 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMDebugTrap);
6959 SDValue Ops[] = {
6960 Chain,
6961 DAG.getTargetConstant(TrapID, SL, MVT::i16)
6962 };
6963 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
6964}
6965
6966SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
6967 SelectionDAG &DAG) const {
6968 if (Subtarget->hasApertureRegs()) {
6969 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
6970 ? AMDGPU::SRC_SHARED_BASE
6971 : AMDGPU::SRC_PRIVATE_BASE;
6972 // Note: this feature (register) is broken. When used as a 32-bit operand,
6973 // it returns a wrong value (all zeroes?). The real value is in the upper 32
6974 // bits.
6975 //
6976 // To work around the issue, directly emit a 64-bit mov from this register,
6977 // then extract the high bits. Note that this shouldn't even result in a
6978 // shift being emitted; it should simply become a pair of registers, e.g.:
6979 // s_mov_b64 s[6:7], src_shared_base
6980 // v_mov_b32_e32 v1, s7
6981 //
6982 // FIXME: It would be more natural to emit a CopyFromReg here, but then copy
6983 // coalescing would kick in and it would think it's okay to use the "HI"
6984 // subregister directly (instead of extracting the HI 32 bits) which is an
6985 // artificial (unusable) register.
6986 // Register TableGen definitions would need an overhaul to get rid of the
6987 // artificial "HI" aperture registers and prevent this kind of issue from
6988 // happening.
6989 SDNode *Mov = DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64,
6990 DAG.getRegister(ApertureRegNo, MVT::i64));
6991 return DAG.getNode(
6992 ISD::TRUNCATE, DL, MVT::i32,
6993 DAG.getNode(ISD::SRL, DL, MVT::i64,
6994 {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));
6995 }
6996
6997 // For code object version 5, private_base and shared_base are passed through
6998 // implicit kernargs.
6999 const Module *M = DAG.getMachineFunction().getFunction().getParent();
7000 if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
7001 ImplicitParameter Param =
7002 (AS == AMDGPUAS::LOCAL_ADDRESS) ? SHARED_BASE : PRIVATE_BASE;
7003 return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
7004 }
7005
7006 MachineFunction &MF = DAG.getMachineFunction();
7007 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7008 Register UserSGPR = Info->getQueuePtrUserSGPR();
7009 if (UserSGPR == AMDGPU::NoRegister) {
7010 // We probably are in a function incorrectly marked with
7011 // amdgpu-no-queue-ptr. This is undefined.
7012 return DAG.getUNDEF(MVT::i32);
7013 }
7014
7015 SDValue QueuePtr = CreateLiveInRegister(
7016 DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
7017
7018 // Offset into amd_queue_t for group_segment_aperture_base_hi /
7019 // private_segment_aperture_base_hi.
7020 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
7021
7022 SDValue Ptr =
7023 DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));
7024
7025 // TODO: Use custom target PseudoSourceValue.
7026 // TODO: We should use the value from the IR intrinsic call, but it might not
7027 // be available and how do we get it?
7028 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
7029 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
7030 commonAlignment(Align(64), StructOffset),
7031 MachineMemOperand::MODereferenceable |
7032 MachineMemOperand::MOInvariant);
7033}
7034
7035/// Return true if the value is a known valid address, such that a null check is
7036/// not necessary.
7038 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
7041 return true;
7042
7043 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
7044 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
7045
7046 // TODO: Search through arithmetic, handle arguments and loads
7047 // marked nonnull.
7048 return false;
7049}
7050
7051SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
7052 SelectionDAG &DAG) const {
7053 SDLoc SL(Op);
7054
7055 const AMDGPUTargetMachine &TM =
7056 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
7057
7058 unsigned DestAS, SrcAS;
7059 SDValue Src;
7060 bool IsNonNull = false;
7061 if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
7062 SrcAS = ASC->getSrcAddressSpace();
7063 Src = ASC->getOperand(0);
7064 DestAS = ASC->getDestAddressSpace();
7065 } else {
7066 assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
7067 Op.getConstantOperandVal(0) ==
7068 Intrinsic::amdgcn_addrspacecast_nonnull);
7069 Src = Op->getOperand(1);
7070 SrcAS = Op->getConstantOperandVal(2);
7071 DestAS = Op->getConstantOperandVal(3);
7072 IsNonNull = true;
7073 }
7074
7075 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
7076
7077 // flat -> local/private
7078 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
7079 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
7080 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
7081 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
7082
7083 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
7084 return Ptr;
7085
7086 unsigned NullVal = TM.getNullPointerValue(DestAS);
7087 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
7088 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
7089
7090 return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
7091 SegmentNullPtr);
7092 }
7093 }
7094
7095 // local/private -> flat
7096 if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
7097 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
7098 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
7099
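// A flat pointer is built by placing the 32-bit segment offset in the low
// dword and the segment aperture base in the high dword.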
7100 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
7101 SDValue CvtPtr =
7102 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
7103 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
7104
7105 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
7106 return CvtPtr;
7107
7108 unsigned NullVal = TM.getNullPointerValue(SrcAS);
7109 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
7110
7111 SDValue NonNull
7112 = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
7113
7114 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
7115 FlatNullPtr);
7116 }
7117 }
7118
7119 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
7120 Op.getValueType() == MVT::i64) {
7121 const SIMachineFunctionInfo *Info =
7122 DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
7123 SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
7124 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
7125 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
7126 }
7127
7128 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
7129 Src.getValueType() == MVT::i64)
7130 return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
7131
7132 // global <-> flat are no-ops and never emitted.
7133
7134 const MachineFunction &MF = DAG.getMachineFunction();
7135 DiagnosticInfoUnsupported InvalidAddrSpaceCast(
7136 MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
7137 DAG.getContext()->diagnose(InvalidAddrSpaceCast);
7138
7139 return DAG.getUNDEF(Op->getValueType(0));
7140}
7141
7142// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
7143// the small vector and inserting them into the big vector. That is better than
7144// the default expansion of doing it via a stack slot. Even though the use of
7145// the stack slot would be optimized away afterwards, the stack slot itself
7146// remains.
7147SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
7148 SelectionDAG &DAG) const {
7149 SDValue Vec = Op.getOperand(0);
7150 SDValue Ins = Op.getOperand(1);
7151 SDValue Idx = Op.getOperand(2);
7152 EVT VecVT = Vec.getValueType();
7153 EVT InsVT = Ins.getValueType();
7154 EVT EltVT = VecVT.getVectorElementType();
7155 unsigned InsNumElts = InsVT.getVectorNumElements();
7156 unsigned IdxVal = Idx->getAsZExtVal();
7157 SDLoc SL(Op);
7158
7159 if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
7160 // Insert 32-bit registers at a time.
7161 assert(InsNumElts % 2 == 0 && "expect legal vector types");
7162
7163 unsigned VecNumElts = VecVT.getVectorNumElements();
7164 EVT NewVecVT =
7165 EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
7166 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
7167 : EVT::getVectorVT(*DAG.getContext(),
7168 MVT::i32, InsNumElts / 2);
7169
7170 Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
7171 Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
7172
7173 for (unsigned I = 0; I != InsNumElts / 2; ++I) {
7174 SDValue Elt;
7175 if (InsNumElts == 2) {
7176 Elt = Ins;
7177 } else {
7178 Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
7179 DAG.getConstant(I, SL, MVT::i32));
7180 }
7181 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
7182 DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
7183 }
7184
7185 return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
7186 }
7187
7188 for (unsigned I = 0; I != InsNumElts; ++I) {
7189 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
7190 DAG.getConstant(I, SL, MVT::i32));
7191 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
7192 DAG.getConstant(IdxVal + I, SL, MVT::i32));
7193 }
7194 return Vec;
7195}
7196
7197SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
7198 SelectionDAG &DAG) const {
7199 SDValue Vec = Op.getOperand(0);
7200 SDValue InsVal = Op.getOperand(1);
7201 SDValue Idx = Op.getOperand(2);
7202 EVT VecVT = Vec.getValueType();
7203 EVT EltVT = VecVT.getVectorElementType();
7204 unsigned VecSize = VecVT.getSizeInBits();
7205 unsigned EltSize = EltVT.getSizeInBits();
7206 SDLoc SL(Op);
7207
7208 // Specially handle the case of v4i16 with static indexing.
7209 unsigned NumElts = VecVT.getVectorNumElements();
7210 auto KIdx = dyn_cast<ConstantSDNode>(Idx);
7211 if (NumElts == 4 && EltSize == 16 && KIdx) {
7212 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
7213
7214 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
7215 DAG.getConstant(0, SL, MVT::i32));
7216 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
7217 DAG.getConstant(1, SL, MVT::i32));
7218
7219 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
7220 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
7221
7222 unsigned Idx = KIdx->getZExtValue();
7223 bool InsertLo = Idx < 2;
7224 SDValue InsHalf = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16,
7225 InsertLo ? LoVec : HiVec,
7226 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
7227 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
7228
7229 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
7230
7231 SDValue Concat = InsertLo ?
7232 DAG.getBuildVector(MVT::v2i32, SL, { InsHalf, HiHalf }) :
7233 DAG.getBuildVector(MVT::v2i32, SL, { LoHalf, InsHalf });
7234
7235 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
7236 }
7237
7238 // Static indexing does not lower to stack access, and hence there is no need
7239 // for special custom lowering to avoid stack access.
7240 if (isa<ConstantSDNode>(Idx))
7241 return SDValue();
7242
7243 // Avoid stack access for dynamic indexing by custom lowering to
7244 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
7245
7246 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
7247
7248 MVT IntVT = MVT::getIntegerVT(VecSize);
7249
7250 // Convert vector index to bit-index and get the required bit mask.
7251 assert(isPowerOf2_32(EltSize));
7252 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
7253 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
7254 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
7255 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
7256 DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);
7257
7258 // 1. Create a congruent vector with the target value in each element.
7259 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
7260 DAG.getSplatBuildVector(VecVT, SL, InsVal));
7261
7262 // 2. Mask off all other indices except the required index within (1).
7263 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
7264
7265 // 3. Mask off the required index within the target vector.
7266 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
7267 SDValue RHS = DAG.getNode(ISD::AND, SL, IntVT,
7268 DAG.getNOT(SL, BFM, IntVT), BCVec);
7269
7270 // 4. Get (2) and (3) ORed into the target vector.
7271 SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS);
7272
7273 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
7274}
7275
7276SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
7277 SelectionDAG &DAG) const {
7278 SDLoc SL(Op);
7279
7280 EVT ResultVT = Op.getValueType();
7281 SDValue Vec = Op.getOperand(0);
7282 SDValue Idx = Op.getOperand(1);
7283 EVT VecVT = Vec.getValueType();
7284 unsigned VecSize = VecVT.getSizeInBits();
7285 EVT EltVT = VecVT.getVectorElementType();
7286
7287 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
7288
7289 // Make sure we do any optimizations that will make it easier to fold
7290 // source modifiers before obscuring it with bit operations.
7291
7292 // XXX - Why doesn't this get called when vector_shuffle is expanded?
7293 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
7294 return Combined;
7295
7296 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
7297 SDValue Lo, Hi;
7298 EVT LoVT, HiVT;
7299 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VecVT);
7300
7301 if (VecSize == 128) {
7302 SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
7303 Lo = DAG.getBitcast(LoVT,
7304 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7305 DAG.getConstant(0, SL, MVT::i32)));
7306 Hi = DAG.getBitcast(HiVT,
7307 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7308 DAG.getConstant(1, SL, MVT::i32)));
7309 } else if (VecSize == 256) {
7310 SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
7311 SDValue Parts[4];
7312 for (unsigned P = 0; P < 4; ++P) {
7313 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7314 DAG.getConstant(P, SL, MVT::i32));
7315 }
7316
7317 Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
7318 Parts[0], Parts[1]));
7319 Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
7320 Parts[2], Parts[3]));
7321 } else {
7322 assert(VecSize == 512);
7323
7324 SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
7325 SDValue Parts[8];
7326 for (unsigned P = 0; P < 8; ++P) {
7327 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7328 DAG.getConstant(P, SL, MVT::i32));
7329 }
7330
7331 Lo = DAG.getBitcast(LoVT,
7332 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
7333 Parts[0], Parts[1], Parts[2], Parts[3]));
7334 Hi = DAG.getBitcast(HiVT,
7335 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
7336 Parts[4], Parts[5],Parts[6], Parts[7]));
7337 }
7338
7339 EVT IdxVT = Idx.getValueType();
7340 unsigned NElem = VecVT.getVectorNumElements();
7341 assert(isPowerOf2_32(NElem));
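// Select the half that contains the element, then extract within that half
// using the index masked down to the half's range.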
7342 SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
7343 SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
7344 SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
7345 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
7346 }
7347
7348 assert(VecSize <= 64);
7349
7350 MVT IntVT = MVT::getIntegerVT(VecSize);
7351
7352 // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
7353 SDValue VecBC = peekThroughBitcasts(Vec);
7354 if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
7355 SDValue Src = VecBC.getOperand(0);
7356 Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
7357 Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
7358 }
7359
7360 unsigned EltSize = EltVT.getSizeInBits();
7361 assert(isPowerOf2_32(EltSize));
7362
7363 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
7364
7365 // Convert vector index to bit-index (* EltSize)
7366 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
7367
7368 SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
7369 SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
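// The requested element now occupies the low bits of Elt; truncate or extend
// it to the result type below.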
7370
7371 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
7372 SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
7373 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
7374 }
7375
7376 return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
7377}
7378
7379static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
7380 assert(Elt % 2 == 0);
7381 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
7382}
7383
7384SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
7385 SelectionDAG &DAG) const {
7386 SDLoc SL(Op);
7387 EVT ResultVT = Op.getValueType();
7388 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
7389
7390 EVT PackVT = ResultVT.isInteger() ? MVT::v2i16 : MVT::v2f16;
7391 EVT EltVT = PackVT.getVectorElementType();
7392 int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
7393
7394 // vector_shuffle <0,1,6,7> lhs, rhs
7395 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
7396 //
7397 // vector_shuffle <6,7,2,3> lhs, rhs
7398 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
7399 //
7400 // vector_shuffle <6,7,0,1> lhs, rhs
7401 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
7402
7403 // Avoid scalarizing when both halves are reading from consecutive elements.
7404 SmallVector<SDValue, 16> Pieces;
7405 for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
7406 if (elementPairIsContiguous(SVN->getMask(), I)) {
7407 const int Idx = SVN->getMaskElt(I);
7408 int VecIdx = Idx < SrcNumElts ? 0 : 1;
7409 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
7410 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL,
7411 PackVT, SVN->getOperand(VecIdx),
7412 DAG.getConstant(EltIdx, SL, MVT::i32));
7413 Pieces.push_back(SubVec);
7414 } else {
7415 const int Idx0 = SVN->getMaskElt(I);
7416 const int Idx1 = SVN->getMaskElt(I + 1);
7417 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
7418 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
7419 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
7420 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
7421
7422 SDValue Vec0 = SVN->getOperand(VecIdx0);
7423 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
7424 Vec0, DAG.getConstant(EltIdx0, SL, MVT::i32));
7425
7426 SDValue Vec1 = SVN->getOperand(VecIdx1);
7427 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
7428 Vec1, DAG.getConstant(EltIdx1, SL, MVT::i32));
7429 Pieces.push_back(DAG.getBuildVector(PackVT, SL, { Elt0, Elt1 }));
7430 }
7431 }
7432
7433 return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
7434}
7435
7436SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
7437 SelectionDAG &DAG) const {
7438 SDValue SVal = Op.getOperand(0);
7439 EVT ResultVT = Op.getValueType();
7440 EVT SValVT = SVal.getValueType();
7441 SDValue UndefVal = DAG.getUNDEF(SValVT);
7442 SDLoc SL(Op);
7443
7444 SmallVector<SDValue, 8> VElts;
7445 VElts.push_back(SVal);
7446 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
7447 VElts.push_back(UndefVal);
7448
7449 return DAG.getBuildVector(ResultVT, SL, VElts);
7450}
7451
7452SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
7453 SelectionDAG &DAG) const {
7454 SDLoc SL(Op);
7455 EVT VT = Op.getValueType();
7456
7457 if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
7458 VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
7459 EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(),
7460 VT.getVectorNumElements() / 2);
7461 MVT HalfIntVT = MVT::getIntegerVT(HalfVT.getSizeInBits());
7462
7463 // Turn into pair of packed build_vectors.
7464 // TODO: Special case for constants that can be materialized with s_mov_b64.
7465 SmallVector<SDValue, 4> LoOps, HiOps;
7466 for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I != E; ++I) {
7467 LoOps.push_back(Op.getOperand(I));
7468 HiOps.push_back(Op.getOperand(I + E));
7469 }
7470 SDValue Lo = DAG.getBuildVector(HalfVT, SL, LoOps);
7471 SDValue Hi = DAG.getBuildVector(HalfVT, SL, HiOps);
7472
7473 SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, HalfIntVT, Lo);
7474 SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, HalfIntVT, Hi);
7475
7476 SDValue Blend = DAG.getBuildVector(MVT::getVectorVT(HalfIntVT, 2), SL,
7477 { CastLo, CastHi });
7478 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
7479 }
7480
7481 if (VT == MVT::v16i16 || VT == MVT::v16f16 || VT == MVT::v16bf16) {
7482 EVT QuarterVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(),
7483 VT.getVectorNumElements() / 4);
7484 MVT QuarterIntVT = MVT::getIntegerVT(QuarterVT.getSizeInBits());
7485
7486 SmallVector<SDValue, 4> Parts[4];
7487 for (unsigned I = 0, E = VT.getVectorNumElements() / 4; I != E; ++I) {
7488 for (unsigned P = 0; P < 4; ++P)
7489 Parts[P].push_back(Op.getOperand(I + P * E));
7490 }
7491 SDValue Casts[4];
7492 for (unsigned P = 0; P < 4; ++P) {
7493 SDValue Vec = DAG.getBuildVector(QuarterVT, SL, Parts[P]);
7494 Casts[P] = DAG.getNode(ISD::BITCAST, SL, QuarterIntVT, Vec);
7495 }
7496
7497 SDValue Blend =
7498 DAG.getBuildVector(MVT::getVectorVT(QuarterIntVT, 4), SL, Casts);
7499 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
7500 }
7501
7502 if (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v32bf16) {
7503 EVT QuarterVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(),
7504 VT.getVectorNumElements() / 8);
7505 MVT QuarterIntVT = MVT::getIntegerVT(QuarterVT.getSizeInBits());
7506
7507 SmallVector<SDValue, 8> Parts[8];
7508 for (unsigned I = 0, E = VT.getVectorNumElements() / 8; I != E; ++I) {
7509 for (unsigned P = 0; P < 8; ++P)
7510 Parts[P].push_back(Op.getOperand(I + P * E));
7511 }
7512 SDValue Casts[8];
7513 for (unsigned P = 0; P < 8; ++P) {
7514 SDValue Vec = DAG.getBuildVector(QuarterVT, SL, Parts[P]);
7515 Casts[P] = DAG.getNode(ISD::BITCAST, SL, QuarterIntVT, Vec);
7516 }
7517
7518 SDValue Blend =
7519 DAG.getBuildVector(MVT::getVectorVT(QuarterIntVT, 8), SL, Casts);
7520 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
7521 }
7522
7523 assert(VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16);
7524 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
7525
7526 SDValue Lo = Op.getOperand(0);
7527 SDValue Hi = Op.getOperand(1);
7528
7529 // Avoid adding defined bits with the zero_extend.
7530 if (Hi.isUndef()) {
7531 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
7532 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
7533 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
7534 }
7535
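// Pack the two halves into an i32: zero-extend Hi, shift it into the top
// 16 bits, and OR in the zero-extended Lo.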
7536 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
7537 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
7538
7539 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
7540 DAG.getConstant(16, SL, MVT::i32));
7541 if (Lo.isUndef())
7542 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
7543
7544 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
7545 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
7546
7547 SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
7548 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
7549}
7550
7551bool
7552 SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
7553 // OSes that use ELF REL relocations (instead of RELA) can only store a
7554 // 32-bit addend in the instruction, so it is not safe to allow offset folding
7555 // which can create arbitrary 64-bit addends. (This is only a problem for
7556 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
7557 // the high 32 bits of the addend.)
7558 //
7559 // This should be kept in sync with how HasRelocationAddend is initialized in
7560 // the constructor of ELFAMDGPUAsmBackend.
7561 if (!Subtarget->isAmdHsaOS())
7562 return false;
7563
7564 // We can fold offsets for anything that doesn't require a GOT relocation.
7565 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
7566 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
7567 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
7568 !shouldEmitGOTReloc(GA->getGlobal());
7569}
7570
7571static SDValue
7572 buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
7573 const SDLoc &DL, int64_t Offset, EVT PtrVT,
7574 unsigned GAFlags = SIInstrInfo::MO_NONE) {
7575 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
7576 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
7577 // lowered to the following code sequence:
7578 //
7579 // For constant address space:
7580 // s_getpc_b64 s[0:1]
7581 // s_add_u32 s0, s0, $symbol
7582 // s_addc_u32 s1, s1, 0
7583 //
7584 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
7585 // a fixup or relocation is emitted to replace $symbol with a literal
7586 // constant, which is a pc-relative offset from the encoding of the $symbol
7587 // operand to the global variable.
7588 //
7589 // For global address space:
7590 // s_getpc_b64 s[0:1]
7591 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
7592 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
7593 //
7594 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
7595 // fixups or relocations are emitted to replace $symbol@*@lo and
7596 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
7597 // which is a 64-bit pc-relative offset from the encoding of the $symbol
7598 // operand to the global variable.
7599 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
7600 SDValue PtrHi;
7601 if (GAFlags == SIInstrInfo::MO_NONE)
7602 PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
7603 else
7604 PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
7605 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
7606}
7607
7608SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
7609 SDValue Op,
7610 SelectionDAG &DAG) const {
7611 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
7612 SDLoc DL(GSD);
7613 EVT PtrVT = Op.getValueType();
7614
7615 const GlobalValue *GV = GSD->getGlobal();
7621 GV->hasExternalLinkage()) {
7622 Type *Ty = GV->getValueType();
7623 // HIP uses an unsized array `extern __shared__ T s[]` or a similar
7624 // zero-sized type in other languages to declare dynamic shared
7625 // memory whose size is not known at compile time. Such arrays are
7626 // allocated by the runtime and placed directly after the statically
7627 // allocated ones, so they all share the same offset.
7628 if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
7629 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
7630 // Adjust alignment for that dynamic shared memory array.
7633 MFI->setUsesDynamicLDS(true);
7634 return SDValue(
7635 DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
7636 }
7637 }
7639 }
7640
7642 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
7644 return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
7645 }
7646
7647 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
7648 SDValue AddrLo = DAG.getTargetGlobalAddress(
7649 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
7650 AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};
7651
7652 SDValue AddrHi = DAG.getTargetGlobalAddress(
7653 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
7654 AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};
7655
7656 return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
7657 }
7658
7659 if (shouldEmitFixup(GV))
7660 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
7661
7662 if (shouldEmitPCReloc(GV))
7663 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
7665
7666 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
7668
7669 Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
7671 const DataLayout &DataLayout = DAG.getDataLayout();
7672 Align Alignment = DataLayout.getABITypeAlign(PtrTy);
7673 MachinePointerInfo PtrInfo
7675
7676 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
7679}
7680
7682 const SDLoc &DL, SDValue V) const {
7683 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
7684 // the destination register.
7685 //
7686 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
7687 // so we will end up with redundant moves to m0.
7688 //
7689 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
7690
7691 // A Null SDValue creates a glue result.
7692 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
7693 V, Chain);
7694 return SDValue(M0, 0);
7695}
7696
7697SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
7698 SDValue Op,
7699 MVT VT,
7700 unsigned Offset) const {
7701 SDLoc SL(Op);
7702 SDValue Param = lowerKernargMemParameter(
7703 DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
7704 // The local size values will have the hi 16-bits as zero.
7705 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
7706 DAG.getValueType(VT));
7707}
7708
7710 EVT VT) {
7712 "non-hsa intrinsic with hsa target",
7713 DL.getDebugLoc());
7714 DAG.getContext()->diagnose(BadIntrin);
7715 return DAG.getUNDEF(VT);
7716}
7717
7719 EVT VT) {
7721 "intrinsic not supported on subtarget",
7722 DL.getDebugLoc());
7723 DAG.getContext()->diagnose(BadIntrin);
7724 return DAG.getUNDEF(VT);
7725}
7726
7728 ArrayRef<SDValue> Elts) {
7729 assert(!Elts.empty());
7730 MVT Type;
7731 unsigned NumElts = Elts.size();
7732
7733 if (NumElts <= 12) {
7734 Type = MVT::getVectorVT(MVT::f32, NumElts);
7735 } else {
7736 assert(Elts.size() <= 16);
7737 Type = MVT::v16f32;
7738 NumElts = 16;
7739 }
7740
7741 SmallVector<SDValue, 16> VecElts(NumElts);
7742 for (unsigned i = 0; i < Elts.size(); ++i) {
7743 SDValue Elt = Elts[i];
7744 if (Elt.getValueType() != MVT::f32)
7745 Elt = DAG.getBitcast(MVT::f32, Elt);
7746 VecElts[i] = Elt;
7747 }
7748 for (unsigned i = Elts.size(); i < NumElts; ++i)
7749 VecElts[i] = DAG.getUNDEF(MVT::f32);
7750
7751 if (NumElts == 1)
7752 return VecElts[0];
7753 return DAG.getBuildVector(Type, DL, VecElts);
7754}
7755
7756static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
7757 SDValue Src, int ExtraElts) {
7758 EVT SrcVT = Src.getValueType();
7759
7760 SmallVector<SDValue, 8> Elts;
7761
7762 if (SrcVT.isVector())
7763 DAG.ExtractVectorElements(Src, Elts);
7764 else
7765 Elts.push_back(Src);
7766
7767 SDValue Undef = DAG.getUNDEF(SrcVT.getScalarType());
7768 while (ExtraElts--)
7769 Elts.push_back(Undef);
7770
7771 return DAG.getBuildVector(CastVT, DL, Elts);
7772}
7773
7774 // Reconstruct the required return value for an image load intrinsic.
7775 // This is more complicated due to the optional use of TexFailCtrl, which
7776 // means the required return type is an aggregate.
7778 ArrayRef<EVT> ResultTypes, bool IsTexFail,
7779 bool Unpacked, bool IsD16, int DMaskPop,
7780 int NumVDataDwords, bool IsAtomicPacked16Bit,
7781 const SDLoc &DL) {
7782 // Determine the required return type. This is the same regardless of the IsTexFail flag.
7783 EVT ReqRetVT = ResultTypes[0];
7784 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
7785 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
7786 ? (ReqRetNumElts + 1) / 2
7787 : ReqRetNumElts;
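// With packed D16 (or packed 16-bit atomics), two 16-bit elements share one
// dword, so round the element count up to a dword count.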
7788
7789 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
7790
7791 MVT DataDwordVT = NumDataDwords == 1 ?
7792 MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
7793
7794 MVT MaskPopVT = MaskPopDwords == 1 ?
7795 MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
7796
7797 SDValue Data(Result, 0);
7798 SDValue TexFail;
7799
7800 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
7801 SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
7802 if (MaskPopVT.isVector()) {
7803 Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
7804 SDValue(Result, 0), ZeroIdx);
7805 } else {
7806 Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
7807 SDValue(Result, 0), ZeroIdx);
7808 }
7809 }
7810
7811 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
7812 Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
7813 NumDataDwords - MaskPopDwords);
7814
7815 if (IsD16)
7816 Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
7817
7818 EVT LegalReqRetVT = ReqRetVT;
7819 if (!ReqRetVT.isVector()) {
7820 if (!Data.getValueType().isInteger())
7821 Data = DAG.getNode(ISD::BITCAST, DL,
7822 Data.getValueType().changeTypeToInteger(), Data);
7823 Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
7824 } else {
7825 // We need to widen the return vector to a legal type
7826 if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
7827 ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
7828 LegalReqRetVT =
7830 ReqRetVT.getVectorNumElements() + 1);
7831 }
7832 }
7833 Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
7834
7835 if (IsTexFail) {
7836 TexFail =
7837 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
7838 DAG.getConstant(MaskPopDwords, DL, MVT::i32));
7839
7840 return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
7841 }
7842
7843 if (Result->getNumValues() == 1)
7844 return Data;
7845
7846 return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
7847}
7848
7849static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
7850 SDValue *LWE, bool &IsTexFail) {
7851 auto TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
7852
7853 uint64_t Value = TexFailCtrlConst->getZExtValue();
7854 if (Value) {
7855 IsTexFail = true;
7856 }
7857
7858 SDLoc DL(TexFailCtrlConst);
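// Bit 0 of the TexFailCtrl immediate enables TFE and bit 1 enables LWE; any
// other set bit makes the control invalid.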
7859 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
7860 Value &= ~(uint64_t)0x1;
7861 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
7862 Value &= ~(uint64_t)0x2;
7863
7864 return Value == 0;
7865}
7866
7868 MVT PackVectorVT,
7869 SmallVectorImpl<SDValue> &PackedAddrs,
7870 unsigned DimIdx, unsigned EndIdx,
7871 unsigned NumGradients) {
7872 SDLoc DL(Op);
7873 for (unsigned I = DimIdx; I < EndIdx; I++) {
7874 SDValue Addr = Op.getOperand(I);
7875
7876 // Gradients are packed with undef for each coordinate.
7877 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
7878 // 1D: undef,dx/dh; undef,dx/dv
7879 // 2D: dy/dh,dx/dh; dy/dv,dx/dv
7880 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
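// The final operand, and the last gradient of each derivative group when the
// per-direction gradient count is odd, are widened to 32 bits on their own
// rather than paired.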
7881 if (((I + 1) >= EndIdx) ||
7882 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
7883 I == DimIdx + NumGradients - 1))) {
7884 if (Addr.getValueType() != MVT::i16)
7885 Addr = DAG.getBitcast(MVT::i16, Addr);
7886 Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
7887 } else {
7888 Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
7889 I++;
7890 }
7891 Addr = DAG.getBitcast(MVT::f32, Addr);
7892 PackedAddrs.push_back(Addr);
7893 }
7894}
7895
7896SDValue SITargetLowering::lowerImage(SDValue Op,
7898 SelectionDAG &DAG, bool WithChain) const {
7899 SDLoc DL(Op);
7901 const GCNSubtarget* ST = &MF.getSubtarget<GCNSubtarget>();
7902 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
7904 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
7905 unsigned IntrOpcode = Intr->BaseOpcode;
7906 bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
7907 bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
7908 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
7909
7910 SmallVector<EVT, 3> ResultTypes(Op->values());
7911 SmallVector<EVT, 3> OrigResultTypes(Op->values());
7912 bool IsD16 = false;
7913 bool IsG16 = false;
7914 bool IsA16 = false;
7915 SDValue VData;
7916 int NumVDataDwords = 0;
7917 bool AdjustRetType = false;
7918 bool IsAtomicPacked16Bit = false;
7919
7920 // Offset of intrinsic arguments
7921 const unsigned ArgOffset = WithChain ? 2 : 1;
7922
7923 unsigned DMask;
7924 unsigned DMaskLanes = 0;
7925
7926 if (BaseOpcode->Atomic) {
7927 VData = Op.getOperand(2);
7928
7929 IsAtomicPacked16Bit =
7930 (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
7931 Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
7932
7933 bool Is64Bit = VData.getValueSizeInBits() == 64;
7934 if (BaseOpcode->AtomicX2) {
7935 SDValue VData2 = Op.getOperand(3);
7936 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
7937 {VData, VData2});
7938 if (Is64Bit)
7939 VData = DAG.getBitcast(MVT::v4i32, VData);
7940
7941 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
7942 DMask = Is64Bit ? 0xf : 0x3;
7943 NumVDataDwords = Is64Bit ? 4 : 2;
7944 } else {
7945 DMask = Is64Bit ? 0x3 : 0x1;
7946 NumVDataDwords = Is64Bit ? 2 : 1;
7947 }
7948 } else {
7949 DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
7950 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
7951
7952 if (BaseOpcode->Store) {
7953 VData = Op.getOperand(2);
7954
7955 MVT StoreVT = VData.getSimpleValueType();
7956 if (StoreVT.getScalarType() == MVT::f16) {
7957 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
7958 return Op; // D16 is unsupported for this instruction
7959
7960 IsD16 = true;
7961 VData = handleD16VData(VData, DAG, true);
7962 }
7963
7964 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
7965 } else if (!BaseOpcode->NoReturn) {
7966 // Work out the number of dwords based on the dmask popcount, the underlying
7967 // type, and whether packing is supported.
7968 MVT LoadVT = ResultTypes[0].getSimpleVT();
7969 if (LoadVT.getScalarType() == MVT::f16) {
7970 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
7971 return Op; // D16 is unsupported for this instruction
7972
7973 IsD16 = true;
7974 }
7975
7976 // Confirm that the return type is large enough for the dmask specified
7977 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
7978 (!LoadVT.isVector() && DMaskLanes > 1))
7979 return Op;
7980
7981 // The SQ block of gfx8 and gfx9 does not estimate register use correctly
7982 // for d16 image_gather4, image_gather4_l, and image_gather4_lz
7983 // instructions.
7984 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
7985 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
7986 NumVDataDwords = (DMaskLanes + 1) / 2;
7987 else
7988 NumVDataDwords = DMaskLanes;
7989
7990 AdjustRetType = true;
7991 }
7992 }
7993
7994 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
7995 SmallVector<SDValue, 4> VAddrs;
7996
7997 // Check for 16 bit addresses or derivatives and pack if true.
7998 MVT VAddrVT =
7999 Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
8000 MVT VAddrScalarVT = VAddrVT.getScalarType();
8001 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8002 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8003
8004 VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
8005 VAddrScalarVT = VAddrVT.getScalarType();
8006 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8007 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8008
8009 // Push back extra arguments.
8010 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
8011 if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
8012 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
8013 // Special handling of bias when A16 is on. Bias is of type half but
8014 // occupies a full 32-bit slot.
8015 SDValue Bias = DAG.getBuildVector(
8016 MVT::v2f16, DL,
8017 {Op.getOperand(ArgOffset + I), DAG.getUNDEF(MVT::f16)});
8018 VAddrs.push_back(Bias);
8019 } else {
8020 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
8021 "Bias needs to be converted to 16 bit in A16 mode");
8022 VAddrs.push_back(Op.getOperand(ArgOffset + I));
8023 }
8024 }
8025
8026 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
8027 // 16-bit gradients are supported, but they are tied to the A16 control,
8028 // so both gradients and addresses must be 16 bit.
8029 LLVM_DEBUG(
8030 dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
8031 "require 16 bit args for both gradients and addresses");
8032 return Op;
8033 }
8034
8035 if (IsA16) {
8036 if (!ST->hasA16()) {
8037 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
8038 "support 16 bit addresses\n");
8039 return Op;
8040 }
8041 }
8042
8043 // We've dealt with incorrect input, so we know that if IsA16 or IsG16
8044 // is set then we have to compress/pack operands (either addresses,
8045 // gradients, or both).
8046 // In the case where a16 and gradients are tied (no G16 support), we
8047 // have already verified that both IsA16 and IsG16 are true.
8048 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
8049 // Activate g16
8050 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
8052 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
8053 }
8054
8055 // Add gradients (packed or unpacked)
8056 if (IsG16) {
8057 // Pack the gradients
8058 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
8059 packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
8060 ArgOffset + Intr->GradientStart,
8061 ArgOffset + Intr->CoordStart, Intr->NumGradients);
8062 } else {
8063 for (unsigned I = ArgOffset + Intr->GradientStart;
8064 I < ArgOffset + Intr->CoordStart; I++)
8065 VAddrs.push_back(Op.getOperand(I));
8066 }
8067
8068 // Add addresses (packed or unpacked)
8069 if (IsA16) {
8070 packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
8071 ArgOffset + Intr->CoordStart, VAddrEnd,
8072 0 /* No gradients */);
8073 } else {
8074 // Add uncompressed address
8075 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
8076 VAddrs.push_back(Op.getOperand(I));
8077 }
8078
8079 // If the register allocator cannot place the address registers contiguously
8080 // without introducing moves, then using the non-sequential address encoding
8081 // is always preferable, since it saves VALU instructions and is usually a
8082 // wash in terms of code size or even better.
8083 //
8084 // However, we currently have no way of hinting to the register allocator that
8085 // MIMG addresses should be placed contiguously when it is possible to do so,
8086 // so force non-NSA for the common 2-address case as a heuristic.
8087 //
8088 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
8089 // allocation when possible.
8090 //
8091 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
8092 // set of the remaining addresses.
8093 const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
8094 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
8095 const bool UseNSA = ST->hasNSAEncoding() &&
8096 VAddrs.size() >= ST->getNSAThreshold(MF) &&
8097 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
8098 const bool UsePartialNSA =
8099 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
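// With partial NSA, only the trailing addresses are packed into one
// contiguous register tuple; the leading addresses remain separate operands.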
8100
8101 SDValue VAddr;
8102 if (UsePartialNSA) {
8103 VAddr = getBuildDwordsVector(DAG, DL,
8104 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
8105 }
8106 else if (!UseNSA) {
8107 VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
8108 }
8109
8110 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
8111 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
8112 SDValue Unorm;
8113 if (!BaseOpcode->Sampler) {
8114 Unorm = True;
8115 } else {
8116 uint64_t UnormConst =
8117 Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
8118
8119 Unorm = UnormConst ? True : False;
8120 }
8121
8122 SDValue TFE;
8123 SDValue LWE;
8124 SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
8125 bool IsTexFail = false;
8126 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
8127 return Op;
8128
8129 if (IsTexFail) {
8130 if (!DMaskLanes) {
8131 // Expecting to get an error flag since TFC is on and dmask is 0.
8132 // Force dmask to be at least 1, otherwise the instruction will fail.
8133 DMask = 0x1;
8134 DMaskLanes = 1;
8135 NumVDataDwords = 1;
8136 }
8137 NumVDataDwords += 1;
8138 AdjustRetType = true;
8139 }
8140
8141 // Something earlier tagged the return type as needing adjustment.
8142 // This happens if the instruction is a load or has TexFailCtrl flags set.
8143 if (AdjustRetType) {
8144 // NumVDataDwords reflects the true number of dwords required in the return type
8145 if (DMaskLanes == 0 && !BaseOpcode->Store) {
8146 // This is a no-op load. This can be eliminated
8147 SDValue Undef = DAG.getUNDEF(Op.getValueType());
8148 if (isa<MemSDNode>(Op))
8149 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
8150 return Undef;
8151 }
8152
8153 EVT NewVT = NumVDataDwords > 1 ?
8154 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumVDataDwords)
8155 : MVT::i32;
8156
8157 ResultTypes[0] = NewVT;
8158 if (ResultTypes.size() == 3) {
8159 // The original result was an aggregate type used for the TexFailCtrl result.
8160 // The actual instruction returns a vector type, which has now been
8161 // created. Remove the aggregate result.
8162 ResultTypes.erase(&ResultTypes[1]);
8163 }
8164 }
8165
8166 unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
8167 if (BaseOpcode->Atomic)
8168 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
8169 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
8171 return Op;
8172
8173 SmallVector<SDValue, 26> Ops;
8174 if (BaseOpcode->Store || BaseOpcode->Atomic)
8175 Ops.push_back(VData); // vdata
8176 if (UsePartialNSA) {
8177 append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
8178 Ops.push_back(VAddr);
8179 }
8180 else if (UseNSA)
8181 append_range(Ops, VAddrs);
8182 else
8183 Ops.push_back(VAddr);
8184 Ops.push_back(Op.getOperand(ArgOffset + Intr->RsrcIndex));
8185 if (BaseOpcode->Sampler)
8186 Ops.push_back(Op.getOperand(ArgOffset + Intr->SampIndex));
8187 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
8188 if (IsGFX10Plus)
8189 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
8190 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8191 Ops.push_back(Unorm);
8192 Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
8193 Ops.push_back(IsA16 && // r128, a16 for gfx9
8194 ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
8195 if (IsGFX10Plus)
8196 Ops.push_back(IsA16 ? True : False);
8197 if (!Subtarget->hasGFX90AInsts()) {
8198 Ops.push_back(TFE); //tfe
8199 } else if (TFE->getAsZExtVal()) {
8200 report_fatal_error("TFE is not supported on this GPU");
8201 }
8202 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8203 Ops.push_back(LWE); // lwe
8204 if (!IsGFX10Plus)
8205 Ops.push_back(DimInfo->DA ? True : False);
8206 if (BaseOpcode->HasD16)
8207 Ops.push_back(IsD16 ? True : False);
8208 if (isa<MemSDNode>(Op))
8209 Ops.push_back(Op.getOperand(0)); // chain
8210
8211 int NumVAddrDwords =
8212 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
8213 int Opcode = -1;
8214
8215 if (IsGFX12Plus) {
8216 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
8217 NumVDataDwords, NumVAddrDwords);
8218 } else if (IsGFX11Plus) {
8219 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
8220 UseNSA ? AMDGPU::MIMGEncGfx11NSA
8221 : AMDGPU::MIMGEncGfx11Default,
8222 NumVDataDwords, NumVAddrDwords);
8223 } else if (IsGFX10Plus) {
8224 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
8225 UseNSA ? AMDGPU::MIMGEncGfx10NSA
8226 : AMDGPU::MIMGEncGfx10Default,
8227 NumVDataDwords, NumVAddrDwords);
8228 } else {
8229 if (Subtarget->hasGFX90AInsts()) {
8230 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
8231 NumVDataDwords, NumVAddrDwords);
8232 if (Opcode == -1)
8234 "requested image instruction is not supported on this GPU");
8235 }
8236 if (Opcode == -1 &&
8238 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
8239 NumVDataDwords, NumVAddrDwords);
8240 if (Opcode == -1)
8241 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
8242 NumVDataDwords, NumVAddrDwords);
8243 }
8244 if (Opcode == -1)
8245 return Op;
8246
8247 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
8248 if (auto MemOp = dyn_cast<MemSDNode>(Op)) {
8249 MachineMemOperand *MemRef = MemOp->getMemOperand();
8250 DAG.setNodeMemRefs(NewNode, {MemRef});
8251 }
8252
8253 if (BaseOpcode->AtomicX2) {
8254 SmallVector<SDValue, 1> Elt;
8255 DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
8256 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
8257 }
8258 if (BaseOpcode->NoReturn)
8259 return SDValue(NewNode, 0);
8260 return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
8261 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
8262 NumVDataDwords, IsAtomicPacked16Bit, DL);
8263}
8264
8265SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
8266 SDValue Offset, SDValue CachePolicy,
8267 SelectionDAG &DAG) const {
8269
8270 const DataLayout &DataLayout = DAG.getDataLayout();
8271 Align Alignment =
8273
8278 VT.getStoreSize(), Alignment);
8279
8280 if (!Offset->isDivergent()) {
8281 SDValue Ops[] = {Rsrc, Offset, CachePolicy};
8282
8283 // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
8284 // s_buffer_load_u16 instruction is emitted for both signed and unsigned
8285 // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
8286 // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
8287 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
8288 SDValue BufferLoad =
8290 DAG.getVTList(MVT::i32), Ops, VT, MMO);
8291 return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
8292 }
8293
8294 // Widen vec3 load to vec4.
8295 if (VT.isVector() && VT.getVectorNumElements() == 3 &&
8296 !Subtarget->hasScalarDwordx3Loads()) {
8297 EVT WidenedVT =
8299 auto WidenedOp = DAG.getMemIntrinsicNode(
8300 AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
8301 MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
8302 auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
8303 DAG.getVectorIdxConstant(0, DL));
8304 return Subvector;
8305 }
8306
8308 DAG.getVTList(VT), Ops, VT, MMO);
8309 }
8310
8311 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
8312 // assume that the buffer is unswizzled.
8313 SDValue Ops[] = {
8314 DAG.getEntryNode(), // Chain
8315 Rsrc, // rsrc
8316 DAG.getConstant(0, DL, MVT::i32), // vindex
8317 {}, // voffset
8318 {}, // soffset
8319 {}, // offset
8320 CachePolicy, // cachepolicy
8321 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8322 };
8323 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
8324 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
8325 return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
8326 }
8327
8328 SmallVector<SDValue, 4> Loads;
8329 unsigned NumLoads = 1;
8330 MVT LoadVT = VT.getSimpleVT();
8331 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
8332 assert((LoadVT.getScalarType() == MVT::i32 ||
8333 LoadVT.getScalarType() == MVT::f32));
8334
8335 if (NumElts == 8 || NumElts == 16) {
8336 NumLoads = NumElts / 4;
8337 LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
8338 }
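// Results wider than 4 dwords are assembled from multiple 16-byte buffer
// loads and concatenated below.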
8339
8340 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
8341
8342 // Use the alignment to ensure that the required offsets will fit into the
8343 // immediate offsets.
8344 setBufferOffsets(Offset, DAG, &Ops[3],
8345 NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
8346
8347 uint64_t InstOffset = Ops[5]->getAsZExtVal();
8348 for (unsigned i = 0; i < NumLoads; ++i) {
8349 Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
8350 Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
8351 LoadVT, MMO, DAG));
8352 }
8353
8354 if (NumElts == 8 || NumElts == 16)
8355 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
8356
8357 return Loads[0];
8358}
8359
8360SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
8361 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
8362 if (!Subtarget->hasArchitectedSGPRs())
8363 return {};
8364 SDLoc SL(Op);
8365 MVT VT = MVT::i32;
8366 SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT);
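// BFE_U32 with offset 25 and width 5 extracts the wave ID from TTMP8[29:25].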
8367 return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
8368 DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT));
8369}
8370
8371SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
8372 unsigned Dim,
8373 const ArgDescriptor &Arg) const {
8374 SDLoc SL(Op);
8376 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
8377 if (MaxID == 0)
8378 return DAG.getConstant(0, SL, MVT::i32);
8379
8380 SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
8381 SDLoc(DAG.getEntryNode()), Arg);
8382
8383 // Don't bother inserting AssertZext for packed IDs since we're emitting the
8384 // masking operations anyway.
8385 //
8386 // TODO: We could assert the top bit is 0 for the source copy.
8387 if (Arg.isMasked())
8388 return Val;
8389
8390 // Preserve the known bits after expansion to a copy.
8392 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
8393 DAG.getValueType(SmallVT));
8394}
8395
8396SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
8397 SelectionDAG &DAG) const {
8399 auto MFI = MF.getInfo<SIMachineFunctionInfo>();
8400
8401 EVT VT = Op.getValueType();
8402 SDLoc DL(Op);
8403 unsigned IntrinsicID = Op.getConstantOperandVal(0);
8404
8405 // TODO: Should this propagate fast-math-flags?
8406
8407 switch (IntrinsicID) {
8408 case Intrinsic::amdgcn_implicit_buffer_ptr: {
8409 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
8410 return emitNonHSAIntrinsicError(DAG, DL, VT);
8411 return getPreloadedValue(DAG, *MFI, VT,
8413 }
8414 case Intrinsic::amdgcn_dispatch_ptr:
8415 case Intrinsic::amdgcn_queue_ptr: {
8416 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
8417 DiagnosticInfoUnsupported BadIntrin(
8418 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
8419 DL.getDebugLoc());
8420 DAG.getContext()->diagnose(BadIntrin);
8421 return DAG.getUNDEF(VT);
8422 }
8423
8424 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
8426 return getPreloadedValue(DAG, *MFI, VT, RegID);
8427 }
8428 case Intrinsic::amdgcn_implicitarg_ptr: {
8429 if (MFI->isEntryFunction())
8430 return getImplicitArgPtr(DAG, DL);
8431 return getPreloadedValue(DAG, *MFI, VT,
8433 }
8434 case Intrinsic::amdgcn_kernarg_segment_ptr: {
8436 // This only makes sense to call in a kernel, so just lower to null.
8437 return DAG.getConstant(0, DL, VT);
8438 }
8439
8440 return getPreloadedValue(DAG, *MFI, VT,
8442 }
8443 case Intrinsic::amdgcn_dispatch_id: {
8444 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
8445 }
8446 case Intrinsic::amdgcn_rcp:
8447 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
8448 case Intrinsic::amdgcn_rsq:
8449 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
8450 case Intrinsic::amdgcn_rsq_legacy:
8451 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
8452 return emitRemovedIntrinsicError(DAG, DL, VT);
8453 return SDValue();
8454 case Intrinsic::amdgcn_rcp_legacy:
8455 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
8456 return emitRemovedIntrinsicError(DAG, DL, VT);
8457 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
8458 case Intrinsic::amdgcn_rsq_clamp: {
8459 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
8460 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
8461
8462 Type *Type = VT.getTypeForEVT(*DAG.getContext());
8463 APFloat Max = APFloat::getLargest(Type->getFltSemantics());
8464 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
8465
8466 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
8467 SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
8468 DAG.getConstantFP(Max, DL, VT));
8469 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
8470 DAG.getConstantFP(Min, DL, VT));
8471 }
8472 case Intrinsic::r600_read_ngroups_x:
8473 if (Subtarget->isAmdHsaOS())
8474 return emitNonHSAIntrinsicError(DAG, DL, VT);
8475
8476 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8477 SI::KernelInputOffsets::NGROUPS_X, Align(4),
8478 false);
8479 case Intrinsic::r600_read_ngroups_y:
8480 if (Subtarget->isAmdHsaOS())
8481 return emitNonHSAIntrinsicError(DAG, DL, VT);
8482
8483 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8484 SI::KernelInputOffsets::NGROUPS_Y, Align(4),
8485 false);
8486 case Intrinsic::r600_read_ngroups_z:
8487 if (Subtarget->isAmdHsaOS())
8488 return emitNonHSAIntrinsicError(DAG, DL, VT);
8489
8490 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8491 SI::KernelInputOffsets::NGROUPS_Z, Align(4),
8492 false);
8493 case Intrinsic::r600_read_global_size_x:
8494 if (Subtarget->isAmdHsaOS())
8495 return emitNonHSAIntrinsicError(DAG, DL, VT);
8496
8497 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8498 SI::KernelInputOffsets::GLOBAL_SIZE_X,
8499 Align(4), false);
8500 case Intrinsic::r600_read_global_size_y:
8501 if (Subtarget->isAmdHsaOS())
8502 return emitNonHSAIntrinsicError(DAG, DL, VT);
8503
8504 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8505 SI::KernelInputOffsets::GLOBAL_SIZE_Y,
8506 Align(4), false);
8507 case Intrinsic::r600_read_global_size_z:
8508 if (Subtarget->isAmdHsaOS())
8509 return emitNonHSAIntrinsicError(DAG, DL, VT);
8510
8511 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8512 SI::KernelInputOffsets::GLOBAL_SIZE_Z,
8513 Align(4), false);
8514 case Intrinsic::r600_read_local_size_x:
8515 if (Subtarget->isAmdHsaOS())
8516 return emitNonHSAIntrinsicError(DAG, DL, VT);
8517
8518 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8519 SI::KernelInputOffsets::LOCAL_SIZE_X);
8520 case Intrinsic::r600_read_local_size_y:
8521 if (Subtarget->isAmdHsaOS())
8522 return emitNonHSAIntrinsicError(DAG, DL, VT);
8523
8524 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8525 SI::KernelInputOffsets::LOCAL_SIZE_Y);
8526 case Intrinsic::r600_read_local_size_z:
8527 if (Subtarget->isAmdHsaOS())
8528 return emitNonHSAIntrinsicError(DAG, DL, VT);
8529
8530 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8531 SI::KernelInputOffsets::LOCAL_SIZE_Z);
8532 case Intrinsic::amdgcn_workgroup_id_x:
8533 return getPreloadedValue(DAG, *MFI, VT,
8534 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
8535 case Intrinsic::amdgcn_workgroup_id_y:
8536 return getPreloadedValue(DAG, *MFI, VT,
8537 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
8538 case Intrinsic::amdgcn_workgroup_id_z:
8539 return getPreloadedValue(DAG, *MFI, VT,
8540 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
8541 case Intrinsic::amdgcn_wave_id:
8542 return lowerWaveID(DAG, Op);
8543 case Intrinsic::amdgcn_lds_kernel_id: {
8544 if (MFI->isEntryFunction())
8545 return getLDSKernelId(DAG, DL);
8546 return getPreloadedValue(DAG, *MFI, VT,
8547 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
8548 }
8549 case Intrinsic::amdgcn_workitem_id_x:
8550 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
8551 case Intrinsic::amdgcn_workitem_id_y:
8552 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
8553 case Intrinsic::amdgcn_workitem_id_z:
8554 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
8555 case Intrinsic::amdgcn_wavefrontsize:
8556 return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
8557 SDLoc(Op), MVT::i32);
8558 case Intrinsic::amdgcn_s_buffer_load: {
8559 unsigned CPol = Op.getConstantOperandVal(3);
8560 // s_buffer_load, because of how it's optimized, can't be volatile
8561 // so reject ones with the volatile bit set.
8562 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
8563 ? AMDGPU::CPol::ALL
8564 : AMDGPU::CPol::ALL_pregfx12))
8565 return Op;
8566 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
8567 DAG);
8568 }
8569 case Intrinsic::amdgcn_fdiv_fast:
8570 return lowerFDIV_FAST(Op, DAG);
8571 case Intrinsic::amdgcn_sin:
8572 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
8573
8574 case Intrinsic::amdgcn_cos:
8575 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
8576
8577 case Intrinsic::amdgcn_mul_u24:
8578 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1), Op.getOperand(2));
8579 case Intrinsic::amdgcn_mul_i24:
8580 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1), Op.getOperand(2));
8581
8582 case Intrinsic::amdgcn_log_clamp: {
8583 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
8584 return SDValue();
8585
8586 return emitRemovedIntrinsicError(DAG, DL, VT);
8587 }
8588 case Intrinsic::amdgcn_fract:
8589 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
8590
8591 case Intrinsic::amdgcn_class:
8592 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
8593 Op.getOperand(1), Op.getOperand(2));
8594 case Intrinsic::amdgcn_div_fmas:
8595 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
8596 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
8597 Op.getOperand(4));
8598
8599 case Intrinsic::amdgcn_div_fixup:
8600 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
8601 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
8602
8603 case Intrinsic::amdgcn_div_scale: {
8604 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
8605
8606 // Translate to the operands expected by the machine instruction. The
8607 // first operand must be the same as either the numerator or the denominator.
8608 SDValue Numerator = Op.getOperand(1);
8609 SDValue Denominator = Op.getOperand(2);
8610
8611 // Note this order is the opposite of the machine instruction's operands,
8612 // which are s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
8613 // intrinsic has the numerator as the first operand to match a normal
8614 // division operation.
8615
8616 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
8617
8618 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
8619 Denominator, Numerator);
8620 }
8621 case Intrinsic::amdgcn_icmp: {
8622 // There is a Pat that handles this variant, so return it as-is.
8623 if (Op.getOperand(1).getValueType() == MVT::i1 &&
8624 Op.getConstantOperandVal(2) == 0 &&
8625 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
8626 return Op;
8627 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
8628 }
8629 case Intrinsic::amdgcn_fcmp: {
8630 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
8631 }
8632 case Intrinsic::amdgcn_ballot:
8633 return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
8634 case Intrinsic::amdgcn_fmed3:
8635 return DAG.getNode(AMDGPUISD::FMED3, DL, VT,
8636 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
8637 case Intrinsic::amdgcn_fdot2:
8638 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT,
8639 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
8640 Op.getOperand(4));
8641 case Intrinsic::amdgcn_fmul_legacy:
8642 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
8643 Op.getOperand(1), Op.getOperand(2));
8644 case Intrinsic::amdgcn_sffbh:
8645 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
8646 case Intrinsic::amdgcn_sbfe:
8647 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
8648 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
8649 case Intrinsic::amdgcn_ubfe:
8650 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
8651 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
8652 case Intrinsic::amdgcn_cvt_pkrtz:
8653 case Intrinsic::amdgcn_cvt_pknorm_i16:
8654 case Intrinsic::amdgcn_cvt_pknorm_u16:
8655 case Intrinsic::amdgcn_cvt_pk_i16:
8656 case Intrinsic::amdgcn_cvt_pk_u16: {
8657 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
8658 EVT VT = Op.getValueType();
8659 unsigned Opcode;
8660
8661 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
8662 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
8663 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
8664 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
8665 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
8666 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
8667 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
8668 Opcode = AMDGPUISD::CVT_PK_I16_I32;
8669 else
8670 Opcode = AMDGPUISD::CVT_PK_U16_U32;
8671
8672 if (isTypeLegal(VT))
8673 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
8674
8675 SDValue Node = DAG.getNode(Opcode, DL, MVT::i32,
8676 Op.getOperand(1), Op.getOperand(2));
8677 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
8678 }
8679 case Intrinsic::amdgcn_fmad_ftz:
8680 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
8681 Op.getOperand(2), Op.getOperand(3));
8682
8683 case Intrinsic::amdgcn_if_break:
8684 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
8685 Op->getOperand(1), Op->getOperand(2)), 0);
8686
8687 case Intrinsic::amdgcn_groupstaticsize: {
8688 Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
8689 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
8690 return Op;
8691
8692 const Module *M = MF.getFunction().getParent();
8693 const GlobalValue *GV =
8694 M->getNamedValue(Intrinsic::getName(Intrinsic::amdgcn_groupstaticsize));
8695 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
8696 SIInstrInfo::MO_ABS32_LO);
8697 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
8698 }
8699 case Intrinsic::amdgcn_is_shared:
8700 case Intrinsic::amdgcn_is_private: {
8701 SDLoc SL(Op);
8702 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared) ?
8703 AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
8704 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
8705 SDValue SrcVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32,
8706 Op.getOperand(1));
8707
8708 SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
8709 DAG.getConstant(1, SL, MVT::i32));
8710 return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
8711 }
8712 case Intrinsic::amdgcn_perm:
8713 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
8714 Op.getOperand(2), Op.getOperand(3));
8715 case Intrinsic::amdgcn_reloc_constant: {
8716 Module *M = const_cast<Module *>(MF.getFunction().getParent());
8717 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
8718 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
8719 auto RelocSymbol = cast<GlobalVariable>(
8720 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
8721 SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
8722 SIInstrInfo::MO_ABS32_LO);
8723 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
8724 }
8725 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8726 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8727 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8728 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8729 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8730 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8731 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8732 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8733 if (Op.getOperand(4).getValueType() == MVT::i32)
8734 return SDValue();
8735
8736 SDLoc SL(Op);
8737 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
8738 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
8739 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
8740 Op.getOperand(3), IndexKeyi32);
8741 }
8742 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8743 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8744 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
8745 if (Op.getOperand(6).getValueType() == MVT::i32)
8746 return SDValue();
8747
8748 SDLoc SL(Op);
8749 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
8750 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
8751 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
8752 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
8753 IndexKeyi32, Op.getOperand(7)});
8754 }
8755 case Intrinsic::amdgcn_addrspacecast_nonnull:
8756 return lowerADDRSPACECAST(Op, DAG);
8757 case Intrinsic::amdgcn_readlane:
8758 case Intrinsic::amdgcn_readfirstlane:
8759 case Intrinsic::amdgcn_writelane:
8760 case Intrinsic::amdgcn_permlane16:
8761 case Intrinsic::amdgcn_permlanex16:
8762 case Intrinsic::amdgcn_permlane64:
8763 return lowerLaneOp(*this, Op.getNode(), DAG);
8764 default:
8765 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
8766 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
8767 return lowerImage(Op, ImageDimIntr, DAG, false);
8768
8769 return Op;
8770 }
8771}
8772
8773 // On targets that do not support a constant in the soffset field, turn a
8774 // zero soffset into SGPR_NULL to avoid generating an extra s_mov of zero.
8775 static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
8776 const GCNSubtarget *Subtarget) {
8777 if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
8778 return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
8779 return SOffset;
8780}
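// For example, a raw buffer atomic whose soffset operand is the constant 0 is
// given SGPR_NULL here on subtargets with hasRestrictedSOffset(), instead of
// materializing the zero with an s_mov_b32.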
8781
8782 SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
8783 SelectionDAG &DAG,
8784 unsigned NewOpcode) const {
8785 SDLoc DL(Op);
8786
8787 SDValue VData = Op.getOperand(2);
8788 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
8789 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
8790 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
8791 SDValue Ops[] = {
8792 Op.getOperand(0), // Chain
8793 VData, // vdata
8794 Rsrc, // rsrc
8795 DAG.getConstant(0, DL, MVT::i32), // vindex
8796 Offsets.first, // voffset
8797 SOffset, // soffset
8798 Offsets.second, // offset
8799 Op.getOperand(6), // cachepolicy
8800 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8801 };
8802
8803 auto *M = cast<MemSDNode>(Op);
8804
8805 EVT MemVT = VData.getValueType();
8806 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
8807 M->getMemOperand());
8808}
8809
8810 SDValue
8811 SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
8812 unsigned NewOpcode) const {
8813 SDLoc DL(Op);
8814
8815 SDValue VData = Op.getOperand(2);
8816 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
8817 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
8818 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
8819 SDValue Ops[] = {
8820 Op.getOperand(0), // Chain
8821 VData, // vdata
8822 Rsrc, // rsrc
8823 Op.getOperand(4), // vindex
8824 Offsets.first, // voffset
8825 SOffset, // soffset
8826 Offsets.second, // offset
8827 Op.getOperand(7), // cachepolicy
8828 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
8829 };
8830
8831 auto *M = cast<MemSDNode>(Op);
8832
8833 EVT MemVT = VData.getValueType();
8834 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
8835 M->getMemOperand());
8836}
8837
8838 SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
8839 SelectionDAG &DAG) const {
8840 unsigned IntrID = Op.getConstantOperandVal(1);
8841 SDLoc DL(Op);
8842
8843 switch (IntrID) {
8844 case Intrinsic::amdgcn_ds_ordered_add:
8845 case Intrinsic::amdgcn_ds_ordered_swap: {
8846 MemSDNode *M = cast<MemSDNode>(Op);
8847 SDValue Chain = M->getOperand(0);
8848 SDValue M0 = M->getOperand(2);
8849 SDValue Value = M->getOperand(3);
8850 unsigned IndexOperand = M->getConstantOperandVal(7);
8851 unsigned WaveRelease = M->getConstantOperandVal(8);
8852 unsigned WaveDone = M->getConstantOperandVal(9);
8853
8854 unsigned OrderedCountIndex = IndexOperand & 0x3f;
8855 IndexOperand &= ~0x3f;
8856 unsigned CountDw = 0;
8857
8858 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
8859 CountDw = (IndexOperand >> 24) & 0xf;
8860 IndexOperand &= ~(0xf << 24);
8861
8862 if (CountDw < 1 || CountDw > 4) {
8863 report_fatal_error(
8864 "ds_ordered_count: dword count must be between 1 and 4");
8865 }
8866 }
8867
8868 if (IndexOperand)
8869 report_fatal_error("ds_ordered_count: bad index operand");
8870
8871 if (WaveDone && !WaveRelease)
8872 report_fatal_error("ds_ordered_count: wave_done requires wave_release");
8873
8874 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
8875 unsigned ShaderType =
8876 SIInstrInfo::getDSShaderTypeValue(DAG.getMachineFunction());
8877 unsigned Offset0 = OrderedCountIndex << 2;
8878 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
8879
8880 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
8881 Offset1 |= (CountDw - 1) << 6;
8882
8883 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
8884 Offset1 |= ShaderType << 2;
8885
8886 unsigned Offset = Offset0 | (Offset1 << 8);
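// Resulting packed layout: Offset[7:2] = ordered-count index (as a byte
// offset), Offset[8] = wave_release, Offset[9] = wave_done, Offset[11:10] =
// shader type (pre-GFX11 only), Offset[12] = instruction (add=0, swap=1),
// and Offset[15:14] = dword count - 1 (GFX10+).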
8887
8888 SDValue Ops[] = {
8889 Chain,
8890 Value,
8891 DAG.getTargetConstant(Offset, DL, MVT::i16),
8892 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
8893 };
8894 return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
8895 M->getVTList(), Ops, M->getMemoryVT(),
8896 M->getMemOperand());
8897 }
8898 case Intrinsic::amdgcn_raw_buffer_load:
8899 case Intrinsic::amdgcn_raw_ptr_buffer_load:
8900 case Intrinsic::amdgcn_raw_atomic_buffer_load:
8901 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
8902 case Intrinsic::amdgcn_raw_buffer_load_format:
8903 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
8904 const bool IsFormat =
8905 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
8906 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
8907
8908 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
8909 auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
8910 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
8911 SDValue Ops[] = {
8912 Op.getOperand(0), // Chain
8913 Rsrc, // rsrc
8914 DAG.getConstant(0, DL, MVT::i32), // vindex
8915 Offsets.first, // voffset
8916 SOffset, // soffset
8917 Offsets.second, // offset
8918 Op.getOperand(5), // cachepolicy, swizzled buffer
8919 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8920 };
8921
8922 auto *M = cast<MemSDNode>(Op);
8923 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
8924 }
8925 case Intrinsic::amdgcn_struct_buffer_load:
8926 case Intrinsic::amdgcn_struct_ptr_buffer_load:
8927 case Intrinsic::amdgcn_struct_buffer_load_format:
8928 case Intrinsic::amdgcn_struct_ptr_buffer_load_format: {
8929 const bool IsFormat =
8930 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
8931 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
8932
8933 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
8934 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
8935 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
8936 SDValue Ops[] = {
8937 Op.getOperand(0), // Chain
8938 Rsrc, // rsrc
8939 Op.getOperand(3), // vindex
8940 Offsets.first, // voffset
8941 SOffset, // soffset
8942 Offsets.second, // offset
8943 Op.getOperand(6), // cachepolicy, swizzled buffer
8944 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
8945 };
8946
8947 return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
8948 }
8949 case Intrinsic::amdgcn_raw_tbuffer_load:
8950 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
8951 MemSDNode *M = cast<MemSDNode>(Op);
8952 EVT LoadVT = Op.getValueType();
8953 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
8954 auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
8955 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
8956
8957 SDValue Ops[] = {
8958 Op.getOperand(0), // Chain
8959 Rsrc, // rsrc
8960 DAG.getConstant(0, DL, MVT::i32), // vindex
8961 Offsets.first, // voffset
8962 SOffset, // soffset
8963 Offsets.second, // offset
8964 Op.getOperand(5), // format
8965 Op.getOperand(6), // cachepolicy, swizzled buffer
8966 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8967 };
8968
8969 if (LoadVT.getScalarType() == MVT::f16)
8970 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
8971 M, DAG, Ops);
8972 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
8973 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
8974 DAG);
8975 }
8976 case Intrinsic::amdgcn_struct_tbuffer_load:
8977 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
8978 MemSDNode *M = cast<MemSDNode>(Op);
8979 EVT LoadVT = Op.getValueType();
8980 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
8981 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
8982 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
8983
8984 SDValue Ops[] = {
8985 Op.getOperand(0), // Chain
8986 Rsrc, // rsrc
8987 Op.getOperand(3), // vindex
8988 Offsets.first, // voffset
8989 SOffset, // soffset
8990 Offsets.second, // offset
8991 Op.getOperand(6), // format
8992 Op.getOperand(7), // cachepolicy, swizzled buffer
8993 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
8994 };
8995
8996 if (LoadVT.getScalarType() == MVT::f16)
8997 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
8998 M, DAG, Ops);
8999 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
9000 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
9001 DAG);
9002 }
9003 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
9004 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
9005 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
9006 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
9007 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
9008 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
9009 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
9010 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
9011 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
9012 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
9013 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
9014 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
9015 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
9016 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
9017 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
9018 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
9019 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
9020 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
9021 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
9022 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
9023 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
9024 case Intrinsic::amdgcn_raw_buffer_atomic_add:
9025 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
9026 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
9027 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
9028 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
9029 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
9030 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
9031 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
9032 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
9033 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
9034 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
9035 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
9036 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
9037 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
9038 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
9039 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
9040 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
9041 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
9042 case Intrinsic::amdgcn_raw_buffer_atomic_and:
9043 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
9044 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
9045 case Intrinsic::amdgcn_raw_buffer_atomic_or:
9046 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
9047 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
9048 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
9049 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
9050 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
9051 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
9052 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
9053 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
9054 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
9055 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
9056 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
9057 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
9058 return lowerRawBufferAtomicIntrin(Op, DAG,
9059 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
9060 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
9061 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
9062 return lowerStructBufferAtomicIntrin(Op, DAG,
9063 AMDGPUISD::BUFFER_ATOMIC_SWAP);
9064 case Intrinsic::amdgcn_struct_buffer_atomic_add:
9065 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
9066 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
9067 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
9068 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
9069 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
9070 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
9071 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
9072 return lowerStructBufferAtomicIntrin(Op, DAG,
9073 AMDGPUISD::BUFFER_ATOMIC_SMIN);
9074 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
9075 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
9076 return lowerStructBufferAtomicIntrin(Op, DAG,
9077 AMDGPUISD::BUFFER_ATOMIC_UMIN);
9078 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
9079 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
9080 return lowerStructBufferAtomicIntrin(Op, DAG,
9081 AMDGPUISD::BUFFER_ATOMIC_SMAX);
9082 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
9083 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
9084 return lowerStructBufferAtomicIntrin(Op, DAG,
9085 AMDGPUISD::BUFFER_ATOMIC_UMAX);
9086 case Intrinsic::amdgcn_struct_buffer_atomic_and:
9087 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
9088 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
9089 case Intrinsic::amdgcn_struct_buffer_atomic_or:
9090 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
9091 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
9092 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
9093 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
9094 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
9095 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
9096 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
9097 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
9098 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
9099 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
9100 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
9101 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
9102 return lowerStructBufferAtomicIntrin(Op, DAG,
9103 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
9104
9105 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
9106 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
9107 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
9108 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
9109 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9110 SDValue Ops[] = {
9111 Op.getOperand(0), // Chain
9112 Op.getOperand(2), // src
9113 Op.getOperand(3), // cmp
9114 Rsrc, // rsrc
9115 DAG.getConstant(0, DL, MVT::i32), // vindex
9116 Offsets.first, // voffset
9117 SOffset, // soffset
9118 Offsets.second, // offset
9119 Op.getOperand(7), // cachepolicy
9120 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9121 };
9122 EVT VT = Op.getValueType();
9123 auto *M = cast<MemSDNode>(Op);
9124
9125 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
9126 Op->getVTList(), Ops, VT, M->getMemOperand());
9127 }
9128 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
9129 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
9130 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
9131 auto Offsets = splitBufferOffsets(Op.getOperand(6), DAG);
9132 auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
9133 SDValue Ops[] = {
9134 Op.getOperand(0), // Chain
9135 Op.getOperand(2), // src
9136 Op.getOperand(3), // cmp
9137 Rsrc, // rsrc
9138 Op.getOperand(5), // vindex
9139 Offsets.first, // voffset
9140 SOffset, // soffset
9141 Offsets.second, // offset
9142 Op.getOperand(8), // cachepolicy
9143 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9144 };
9145 EVT VT = Op.getValueType();
9146 auto *M = cast<MemSDNode>(Op);
9147
9148 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
9149 Op->getVTList(), Ops, VT, M->getMemOperand());
9150 }
9151 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
9152 MemSDNode *M = cast<MemSDNode>(Op);
9153 SDValue NodePtr = M->getOperand(2);
9154 SDValue RayExtent = M->getOperand(3);
9155 SDValue RayOrigin = M->getOperand(4);
9156 SDValue RayDir = M->getOperand(5);
9157 SDValue RayInvDir = M->getOperand(6);
9158 SDValue TDescr = M->getOperand(7);
9159
9160 assert(NodePtr.getValueType() == MVT::i32 ||
9161 NodePtr.getValueType() == MVT::i64);
9162 assert(RayDir.getValueType() == MVT::v3f16 ||
9163 RayDir.getValueType() == MVT::v3f32);
9164
9165 if (!Subtarget->hasGFX10_AEncoding()) {
9166 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
9167 return SDValue();
9168 }
9169
9170 const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget);
9171 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
9172 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
9173 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
9174 const bool Is64 = NodePtr.getValueType() == MVT::i64;
9175 const unsigned NumVDataDwords = 4;
9176 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
9177 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
9178 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
9179 NumVAddrs <= Subtarget->getNSAMaxSize()) ||
9180 IsGFX12Plus;
9181 const unsigned BaseOpcodes[2][2] = {
9182 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
9183 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
9184 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
9185 int Opcode;
9186 if (UseNSA) {
9187 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
9188 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
9189 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
9190 : AMDGPU::MIMGEncGfx10NSA,
9191 NumVDataDwords, NumVAddrDwords);
9192 } else {
9193 assert(!IsGFX12Plus);
9194 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
9195 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
9196 : AMDGPU::MIMGEncGfx10Default,
9197 NumVDataDwords, NumVAddrDwords);
9198 }
9199 assert(Opcode != -1);
9200
9202
9203 auto packLanes = [&DAG, &Ops, &DL] (SDValue Op, bool IsAligned) {
9205 DAG.ExtractVectorElements(Op, Lanes, 0, 3);
9206 if (Lanes[0].getValueSizeInBits() == 32) {
9207 for (unsigned I = 0; I < 3; ++I)
9208 Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
9209 } else {
9210 if (IsAligned) {
9211 Ops.push_back(
9212 DAG.getBitcast(MVT::i32,
9213 DAG.getBuildVector(MVT::v2f16, DL,
9214 { Lanes[0], Lanes[1] })));
9215 Ops.push_back(Lanes[2]);
9216 } else {
9217 SDValue Elt0 = Ops.pop_back_val();
9218 Ops.push_back(
9219 DAG.getBitcast(MVT::i32,
9220 DAG.getBuildVector(MVT::v2f16, DL,
9221 { Elt0, Lanes[0] })));
9222 Ops.push_back(
9223 DAG.getBitcast(MVT::i32,
9224 DAG.getBuildVector(MVT::v2f16, DL,
9225 { Lanes[1], Lanes[2] })));
9226 }
9227 }
9228 };
9229
9230 if (UseNSA && IsGFX11Plus) {
9231 Ops.push_back(NodePtr);
9232 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
9233 Ops.push_back(RayOrigin);
9234 if (IsA16) {
9235 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
9236 DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
9237 DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
9238 for (unsigned I = 0; I < 3; ++I) {
9239 MergedLanes.push_back(DAG.getBitcast(
9240 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
9241 {DirLanes[I], InvDirLanes[I]})));
9242 }
9243 Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
9244 } else {
9245 Ops.push_back(RayDir);
9246 Ops.push_back(RayInvDir);
9247 }
9248 } else {
9249 if (Is64)
9250 DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
9251 2);
9252 else
9253 Ops.push_back(NodePtr);
9254
9255 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
9256 packLanes(RayOrigin, true);
9257 packLanes(RayDir, true);
9258 packLanes(RayInvDir, false);
9259 }
9260
9261 if (!UseNSA) {
9262 // Build a single vector containing all the operands so far prepared.
9263 if (NumVAddrDwords > 12) {
9264 SDValue Undef = DAG.getUNDEF(MVT::i32);
9265 Ops.append(16 - Ops.size(), Undef);
9266 }
9267 assert(Ops.size() >= 8 && Ops.size() <= 12);
9268 SDValue MergedOps = DAG.getBuildVector(
9269 MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
9270 Ops.clear();
9271 Ops.push_back(MergedOps);
9272 }
9273
9274 Ops.push_back(TDescr);
9275 Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1));
9276 Ops.push_back(M->getChain());
9277
9278 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
9279 MachineMemOperand *MemRef = M->getMemOperand();
9280 DAG.setNodeMemRefs(NewNode, {MemRef});
9281 return SDValue(NewNode, 0);
9282 }
9283 case Intrinsic::amdgcn_global_atomic_fmin:
9284 case Intrinsic::amdgcn_global_atomic_fmax:
9285 case Intrinsic::amdgcn_global_atomic_fmin_num:
9286 case Intrinsic::amdgcn_global_atomic_fmax_num:
9287 case Intrinsic::amdgcn_flat_atomic_fmin:
9288 case Intrinsic::amdgcn_flat_atomic_fmax:
9289 case Intrinsic::amdgcn_flat_atomic_fmin_num:
9290 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9291 MemSDNode *M = cast<MemSDNode>(Op);
9292 SDValue Ops[] = {
9293 M->getOperand(0), // Chain
9294 M->getOperand(2), // Ptr
9295 M->getOperand(3) // Value
9296 };
9297 unsigned Opcode = 0;
9298 switch (IntrID) {
9299 case Intrinsic::amdgcn_global_atomic_fmin:
9300 case Intrinsic::amdgcn_global_atomic_fmin_num:
9301 case Intrinsic::amdgcn_flat_atomic_fmin:
9302 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
9303 Opcode = ISD::ATOMIC_LOAD_FMIN;
9304 break;
9305 }
9306 case Intrinsic::amdgcn_global_atomic_fmax:
9307 case Intrinsic::amdgcn_global_atomic_fmax_num:
9308 case Intrinsic::amdgcn_flat_atomic_fmax:
9309 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9310 Opcode = ISD::ATOMIC_LOAD_FMAX;
9311 break;
9312 }
9313 default:
9314 llvm_unreachable("unhandled atomic opcode");
9315 }
9316 return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
9317 Ops, M->getMemOperand());
9318 }
9319 case Intrinsic::amdgcn_s_get_barrier_state: {
9320 SDValue Chain = Op->getOperand(0);
9322 unsigned Opc;
9323 bool IsInlinableBarID = false;
9324 int64_t BarID;
9325
9326 if (isa<ConstantSDNode>(Op->getOperand(2))) {
9327 BarID = cast<ConstantSDNode>(Op->getOperand(2))->getSExtValue();
9328 IsInlinableBarID = AMDGPU::isInlinableIntLiteral(BarID);
9329 }
9330
9331 if (IsInlinableBarID) {
9332 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
9333 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
9334 Ops.push_back(K);
9335 } else {
9336 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
9337 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(2));
9338 Ops.push_back(M0Val.getValue(0));
9339 }
9340
9341 auto NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
9342 return SDValue(NewMI, 0);
9343 }
9344 default:
9345
9346 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
9347 AMDGPU::getImageDimIntrinsicInfo(IntrID))
9348 return lowerImage(Op, ImageDimIntr, DAG, true);
9349
9350 return SDValue();
9351 }
9352}
9353
9354 // Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
9355 // dwordx4 on subtargets without dwordx3 load/stores, and handle TFE loads.
9356 SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
9357 SDVTList VTList,
9358 ArrayRef<SDValue> Ops, EVT MemVT,
9359 MachineMemOperand *MMO,
9360 SelectionDAG &DAG) const {
9361 LLVMContext &C = *DAG.getContext();
9362 MachineFunction &MF = DAG.getMachineFunction();
9363 EVT VT = VTList.VTs[0];
9364
9365 assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
9366 bool IsTFE = VTList.NumVTs == 3;
9367 if (IsTFE) {
9368 unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
9369 unsigned NumOpDWords = NumValueDWords + 1;
9370 EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
9371 SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
9372 MachineMemOperand *OpDWordsMMO =
9373 MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
9374 SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
9375 OpDWordsVT, OpDWordsMMO, DAG);
9376 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
9377 DAG.getVectorIdxConstant(NumValueDWords, DL));
9378 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
9379 SDValue ValueDWords =
9380 NumValueDWords == 1
9381 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
9382 : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
9383 EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
9384 ZeroIdx);
9385 SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
9386 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
9387 }
9388
9389 if (!Subtarget->hasDwordx3LoadStores() &&
9390 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
9391 EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
9392 EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
9393 MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
9394 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
9395 SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
9396 WidenedMemVT, WidenedMMO);
9397 SDValue Value = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Op,
9398 DAG.getVectorIdxConstant(0, DL));
9399 return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
9400 }
9401
9402 return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
9403}
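// For example, a TFE load with a v2f32 result is issued above as a v3i32
// intrinsic node: dwords 0-1 carry the data and dword 2 carries the TFE
// status, and the result is split back into {value, status, chain}.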
9404
9405 SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
9406 bool ImageStore) const {
9407 EVT StoreVT = VData.getValueType();
9408
9409 // No change for f16 and legal vector D16 types.
9410 if (!StoreVT.isVector())
9411 return VData;
9412
9413 SDLoc DL(VData);
9414 unsigned NumElements = StoreVT.getVectorNumElements();
9415
9416 if (Subtarget->hasUnpackedD16VMem()) {
9417 // We need to unpack the packed data to store.
9418 EVT IntStoreVT = StoreVT.changeTypeToInteger();
9419 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9420
9421 EVT EquivStoreVT =
9422 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
9423 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
9424 return DAG.UnrollVectorOp(ZExt.getNode());
9425 }
9426
9427 // The sq block of gfx8.1 does not estimate register use correctly for d16
9428 // image store instructions. The data operand is computed as if it were not a
9429 // d16 image instruction.
9430 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
9431 // Bitcast to i16
9432 EVT IntStoreVT = StoreVT.changeTypeToInteger();
9433 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9434
9435 // Decompose into scalars
9437 DAG.ExtractVectorElements(IntVData, Elts);
9438
9439 // Group pairs of i16 into v2i16 and bitcast to i32
9440 SmallVector<SDValue, 4> PackedElts;
9441 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
9442 SDValue Pair =
9443 DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
9444 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
9445 PackedElts.push_back(IntPair);
9446 }
9447 if ((NumElements % 2) == 1) {
9448 // Handle v3i16
9449 unsigned I = Elts.size() / 2;
9450 SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
9451 {Elts[I * 2], DAG.getUNDEF(MVT::i16)});
9452 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
9453 PackedElts.push_back(IntPair);
9454 }
9455
9456 // Pad using UNDEF
9457 PackedElts.resize(Elts.size(), DAG.getUNDEF(MVT::i32));
9458
9459 // Build final vector
9460 EVT VecVT =
9461 EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
9462 return DAG.getBuildVector(VecVT, DL, PackedElts);
9463 }
9464
9465 if (NumElements == 3) {
9466 EVT IntStoreVT =
9467 EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits());
9468 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9469
9470 EVT WidenedStoreVT = EVT::getVectorVT(
9471 *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
9472 EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
9473 WidenedStoreVT.getStoreSizeInBits());
9474 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
9475 return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
9476 }
9477
9478 assert(isTypeLegal(StoreVT));
9479 return VData;
9480}
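// For example, on subtargets with unpacked D16 VMEM a v4f16 payload is bitcast
// to v4i16 and zero-extended to v4i32 (one dword per component), while with
// the gfx8.1 image-store bug pairs of 16-bit elements are repacked into i32s
// and the vector is padded with undef back to the original element count.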
9481
9482 SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
9483 SelectionDAG &DAG) const {
9484 SDLoc DL(Op);
9485 SDValue Chain = Op.getOperand(0);
9486 unsigned IntrinsicID = Op.getConstantOperandVal(1);
9487 MachineFunction &MF = DAG.getMachineFunction();
9488
9489 switch (IntrinsicID) {
9490 case Intrinsic::amdgcn_exp_compr: {
9491 if (!Subtarget->hasCompressedExport()) {
9492 DiagnosticInfoUnsupported BadIntrin(
9493 MF.getFunction(),
9494 "intrinsic not supported on subtarget", DL.getDebugLoc());
9495 DAG.getContext()->diagnose(BadIntrin);
9496 }
9497 SDValue Src0 = Op.getOperand(4);
9498 SDValue Src1 = Op.getOperand(5);
9499 // Hack around illegal type on SI by directly selecting it.
9500 if (isTypeLegal(Src0.getValueType()))
9501 return SDValue();
9502
9503 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
9504 SDValue Undef = DAG.getUNDEF(MVT::f32);
9505 const SDValue Ops[] = {
9506 Op.getOperand(2), // tgt
9507 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
9508 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
9509 Undef, // src2
9510 Undef, // src3
9511 Op.getOperand(7), // vm
9512 DAG.getTargetConstant(1, DL, MVT::i1), // compr
9513 Op.getOperand(3), // en
9514 Op.getOperand(0) // Chain
9515 };
9516
9517 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
9518 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
9519 }
9520 case Intrinsic::amdgcn_s_barrier: {
9521 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
9522 if (getTargetMachine().getOptLevel() > CodeGenOptLevel::None) {
9523 unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
9524 if (WGSize <= ST.getWavefrontSize())
9525 return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
9526 Op.getOperand(0)), 0);
9527 }
9528
9529 // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
9530 if (ST.hasSplitBarriers()) {
9531 SDValue K =
9533 SDValue BarSignal =
9534 SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL,
9535 MVT::Other, K, Op.getOperand(0)),
9536 0);
9537 SDValue BarWait =
9538 SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_WAIT, DL, MVT::Other, K,
9539 BarSignal.getValue(0)),
9540 0);
9541 return BarWait;
9542 }
9543
9544 return SDValue();
9545 };
9546
9547 case Intrinsic::amdgcn_struct_tbuffer_store:
9548 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
9549 SDValue VData = Op.getOperand(2);
9550 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9551 if (IsD16)
9552 VData = handleD16VData(VData, DAG);
9553 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9554 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
9555 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9556 SDValue Ops[] = {
9557 Chain,
9558 VData, // vdata
9559 Rsrc, // rsrc
9560 Op.getOperand(4), // vindex
9561 Offsets.first, // voffset
9562 SOffset, // soffset
9563 Offsets.second, // offset
9564 Op.getOperand(7), // format
9565 Op.getOperand(8), // cachepolicy, swizzled buffer
9566 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9567 };
9568 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
9569 AMDGPUISD::TBUFFER_STORE_FORMAT;
9570 MemSDNode *M = cast<MemSDNode>(Op);
9571 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9572 M->getMemoryVT(), M->getMemOperand());
9573 }
9574
9575 case Intrinsic::amdgcn_raw_tbuffer_store:
9576 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
9577 SDValue VData = Op.getOperand(2);
9578 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9579 if (IsD16)
9580 VData = handleD16VData(VData, DAG);
9581 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9582 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
9583 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9584 SDValue Ops[] = {
9585 Chain,
9586 VData, // vdata
9587 Rsrc, // rsrc
9588 DAG.getConstant(0, DL, MVT::i32), // vindex
9589 Offsets.first, // voffset
9590 SOffset, // soffset
9591 Offsets.second, // offset
9592 Op.getOperand(6), // format
9593 Op.getOperand(7), // cachepolicy, swizzled buffer
9594 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9595 };
9596 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
9597 AMDGPUISD::TBUFFER_STORE_FORMAT;
9598 MemSDNode *M = cast<MemSDNode>(Op);
9599 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9600 M->getMemoryVT(), M->getMemOperand());
9601 }
9602
9603 case Intrinsic::amdgcn_raw_buffer_store:
9604 case Intrinsic::amdgcn_raw_ptr_buffer_store:
9605 case Intrinsic::amdgcn_raw_buffer_store_format:
9606 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
9607 const bool IsFormat =
9608 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
9609 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
9610
9611 SDValue VData = Op.getOperand(2);
9612 EVT VDataVT = VData.getValueType();
9613 EVT EltType = VDataVT.getScalarType();
9614 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
9615 if (IsD16) {
9616 VData = handleD16VData(VData, DAG);
9617 VDataVT = VData.getValueType();
9618 }
9619
9620 if (!isTypeLegal(VDataVT)) {
9621 VData =
9622 DAG.getNode(ISD::BITCAST, DL,
9623 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
9624 }
9625
9626 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9627 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
9628 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9629 SDValue Ops[] = {
9630 Chain,
9631 VData,
9632 Rsrc,
9633 DAG.getConstant(0, DL, MVT::i32), // vindex
9634 Offsets.first, // voffset
9635 SOffset, // soffset
9636 Offsets.second, // offset
9637 Op.getOperand(6), // cachepolicy, swizzled buffer
9638 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9639 };
9640 unsigned Opc =
9641 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
9642 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
9643 MemSDNode *M = cast<MemSDNode>(Op);
9644
9645 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
9646 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
9647 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
9648
9649 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9650 M->getMemoryVT(), M->getMemOperand());
9651 }
9652
9653 case Intrinsic::amdgcn_struct_buffer_store:
9654 case Intrinsic::amdgcn_struct_ptr_buffer_store:
9655 case Intrinsic::amdgcn_struct_buffer_store_format:
9656 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
9657 const bool IsFormat =
9658 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
9659 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
9660
9661 SDValue VData = Op.getOperand(2);
9662 EVT VDataVT = VData.getValueType();
9663 EVT EltType = VDataVT.getScalarType();
9664 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
9665
9666 if (IsD16) {
9667 VData = handleD16VData(VData, DAG);
9668 VDataVT = VData.getValueType();
9669 }
9670
9671 if (!isTypeLegal(VDataVT)) {
9672 VData =
9673 DAG.getNode(ISD::BITCAST, DL,
9674 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
9675 }
9676
9677 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9678 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
9679 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9680 SDValue Ops[] = {
9681 Chain,
9682 VData,
9683 Rsrc,
9684 Op.getOperand(4), // vindex
9685 Offsets.first, // voffset
9686 SOffset, // soffset
9687 Offsets.second, // offset
9688 Op.getOperand(7), // cachepolicy, swizzled buffer
9689 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9690 };
9691 unsigned Opc =
9692 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
9693 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
9694 MemSDNode *M = cast<MemSDNode>(Op);
9695
9696 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
9697 EVT VDataType = VData.getValueType().getScalarType();
9698 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
9699 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
9700
9701 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9702 M->getMemoryVT(), M->getMemOperand());
9703 }
9704 case Intrinsic::amdgcn_raw_buffer_load_lds:
9705 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
9706 case Intrinsic::amdgcn_struct_buffer_load_lds:
9707 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
9708 assert(!AMDGPU::isGFX12Plus(*Subtarget));
9709 unsigned Opc;
9710 bool HasVIndex =
9711 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
9712 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
9713 unsigned OpOffset = HasVIndex ? 1 : 0;
9714 SDValue VOffset = Op.getOperand(5 + OpOffset);
9715 bool HasVOffset = !isNullConstant(VOffset);
9716 unsigned Size = Op->getConstantOperandVal(4);
9717
9718 switch (Size) {
9719 default:
9720 return SDValue();
9721 case 1:
9722 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
9723 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
9724 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
9725 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
9726 break;
9727 case 2:
9728 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
9729 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
9730 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
9731 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
9732 break;
9733 case 4:
9734 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
9735 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
9736 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
9737 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
9738 break;
9739 }
9740
9741 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
9742
9744
9745 if (HasVIndex && HasVOffset)
9746 Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
9747 { Op.getOperand(5), // VIndex
9748 VOffset }));
9749 else if (HasVIndex)
9750 Ops.push_back(Op.getOperand(5));
9751 else if (HasVOffset)
9752 Ops.push_back(VOffset);
9753
9754 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9755 Ops.push_back(Rsrc);
9756 Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
9757 Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
9758 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
9759 Ops.push_back(
9760 DAG.getTargetConstant(Aux & AMDGPU::CPol::ALL, DL, MVT::i8)); // cpol
9761 Ops.push_back(DAG.getTargetConstant(
9762 Aux & AMDGPU::CPol::SWZ_pregfx12 ? 1 : 0, DL, MVT::i8)); // swz
9763 Ops.push_back(M0Val.getValue(0)); // Chain
9764 Ops.push_back(M0Val.getValue(1)); // Glue
9765
9766 auto *M = cast<MemSDNode>(Op);
9767 MachineMemOperand *LoadMMO = M->getMemOperand();
9768 // Don't set the offset value here because the pointer points to the base of
9769 // the buffer.
9770 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
9771
9772 MachinePointerInfo StorePtrI = LoadPtrI;
9773 LoadPtrI.V = PoisonValue::get(
9777
9778 auto F = LoadMMO->getFlags() &
9779 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
9780 LoadMMO =
9781 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
9782 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
9783
9784 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
9785 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t),
9786 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
9787
9788 auto Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
9789 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
9790
9791 return SDValue(Load, 0);
9792 }
9793 case Intrinsic::amdgcn_global_load_lds: {
9794 unsigned Opc;
9795 unsigned Size = Op->getConstantOperandVal(4);
9796 switch (Size) {
9797 default:
9798 return SDValue();
9799 case 1:
9800 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
9801 break;
9802 case 2:
9803 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
9804 break;
9805 case 4:
9806 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
9807 break;
9808 }
9809
9810 auto *M = cast<MemSDNode>(Op);
9811 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
9812
9814
9815 SDValue Addr = Op.getOperand(2); // Global ptr
9816 SDValue VOffset;
9817 // Try to split SAddr and VOffset. Global and LDS pointers share the same
9818 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
9819 if (Addr->isDivergent() && Addr.getOpcode() == ISD::ADD) {
9820 SDValue LHS = Addr.getOperand(0);
9821 SDValue RHS = Addr.getOperand(1);
9822
9823 if (LHS->isDivergent())
9824 std::swap(LHS, RHS);
9825
9826 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
9827 RHS.getOperand(0).getValueType() == MVT::i32) {
9828 // add (i64 sgpr), (zero_extend (i32 vgpr))
9829 Addr = LHS;
9830 VOffset = RHS.getOperand(0);
9831 }
9832 }
9833
9834 Ops.push_back(Addr);
9835 if (!Addr->isDivergent()) {
9836 Opc = AMDGPU::getGlobalSaddrOp(Opc);
9837 if (!VOffset)
9838 VOffset = SDValue(
9839 DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
9840 DAG.getTargetConstant(0, DL, MVT::i32)), 0);
9841 Ops.push_back(VOffset);
9842 }
9843
9844 Ops.push_back(Op.getOperand(5)); // Offset
9845 Ops.push_back(Op.getOperand(6)); // CPol
9846 Ops.push_back(M0Val.getValue(0)); // Chain
9847 Ops.push_back(M0Val.getValue(1)); // Glue
9848
9849 MachineMemOperand *LoadMMO = M->getMemOperand();
9850 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
9851 LoadPtrI.Offset = Op->getConstantOperandVal(5);
9852 MachinePointerInfo StorePtrI = LoadPtrI;
9853 LoadPtrI.V = PoisonValue::get(
9854 PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
9855 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
9856 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
9857 auto F = LoadMMO->getFlags() &
9858 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
9859 LoadMMO =
9860 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
9861 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
9862 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
9863 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4),
9864 LoadMMO->getAAInfo());
9865
9866 auto Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
9867 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
9868
9869 return SDValue(Load, 0);
9870 }
9871 case Intrinsic::amdgcn_end_cf:
9872 return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
9873 Op->getOperand(2), Chain), 0);
9874 case Intrinsic::amdgcn_s_barrier_init:
9875 case Intrinsic::amdgcn_s_barrier_join:
9876 case Intrinsic::amdgcn_s_wakeup_barrier: {
9877 SDValue Chain = Op->getOperand(0);
9879 SDValue BarOp = Op->getOperand(2);
9880 unsigned Opc;
9881 bool IsInlinableBarID = false;
9882 int64_t BarVal;
9883
9884 if (isa<ConstantSDNode>(BarOp)) {
9885 BarVal = cast<ConstantSDNode>(BarOp)->getSExtValue();
9886 IsInlinableBarID = AMDGPU::isInlinableIntLiteral(BarVal);
9887 }
9888
9889 if (IsInlinableBarID) {
9890 switch (IntrinsicID) {
9891 default:
9892 return SDValue();
9893 case Intrinsic::amdgcn_s_barrier_init:
9894 Opc = AMDGPU::S_BARRIER_INIT_IMM;
9895 break;
9896 case Intrinsic::amdgcn_s_barrier_join:
9897 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
9898 break;
9899 case Intrinsic::amdgcn_s_wakeup_barrier:
9900 Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
9901 break;
9902 }
9903
9904 SDValue K = DAG.getTargetConstant(BarVal, DL, MVT::i32);
9905 Ops.push_back(K);
9906 } else {
9907 switch (IntrinsicID) {
9908 default:
9909 return SDValue();
9910 case Intrinsic::amdgcn_s_barrier_init:
9911 Opc = AMDGPU::S_BARRIER_INIT_M0;
9912 break;
9913 case Intrinsic::amdgcn_s_barrier_join:
9914 Opc = AMDGPU::S_BARRIER_JOIN_M0;
9915 break;
9916 case Intrinsic::amdgcn_s_wakeup_barrier:
9917 Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
9918 break;
9919 }
9920 }
9921
9922 if (IntrinsicID == Intrinsic::amdgcn_s_barrier_init) {
9923 SDValue M0Val;
9924 // Member count will be read from M0[16:22]
9925 M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, Op.getOperand(3),
9926 DAG.getShiftAmountConstant(16, MVT::i32, DL));
9927
9928 if (!IsInlinableBarID) {
9929 // If reference to barrier id is not an inline constant then it must be
9930 // referenced with M0[4:0]. Perform an OR with the member count to
9931 // include it in M0.
9932 M0Val = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32,
9933 Op.getOperand(2), M0Val),
9934 0);
9935 }
9936 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
9937 } else if (!IsInlinableBarID) {
9938 Ops.push_back(copyToM0(DAG, Chain, DL, BarOp).getValue(0));
9939 }
9940
9941 auto NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
9942 return SDValue(NewMI, 0);
9943 }
9944 default: {
9945 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
9946 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
9947 return lowerImage(Op, ImageDimIntr, DAG, true);
9948
9949 return Op;
9950 }
9951 }
9952}
9953
9954// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
9955// offset (the offset that is included in bounds checking and swizzling, to be
9956// split between the instruction's voffset and immoffset fields) and soffset
9957// (the offset that is excluded from bounds checking and swizzling, to go in
9958// the instruction's soffset field). This function takes the first kind of
9959// offset and figures out how to split it between voffset and immoffset.
9960 std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
9961 SDValue Offset, SelectionDAG &DAG) const {
9962 SDLoc DL(Offset);
9963 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
9964 SDValue N0 = Offset;
9965 ConstantSDNode *C1 = nullptr;
9966
9967 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
9968 N0 = SDValue();
9969 else if (DAG.isBaseWithConstantOffset(N0)) {
9970 C1 = cast<ConstantSDNode>(N0.getOperand(1));
9971 N0 = N0.getOperand(0);
9972 }
9973
9974 if (C1) {
9975 unsigned ImmOffset = C1->getZExtValue();
9976 // If the immediate value is too big for the immoffset field, put only bits
9977 // that would normally fit in the immoffset field. The remaining value that
9978 // is copied/added for the voffset field is a large power of 2, and it
9979 // stands more chance of being CSEd with the copy/add for another similar
9980 // load/store.
9981 // However, do not do that rounding down if the overflow part is negative
9982 // (as a signed 32-bit value), as it appears to be illegal to have a negative
9983 // offset in the vgpr, even if adding the immediate offset makes it positive.
9984 unsigned Overflow = ImmOffset & ~MaxImm;
9985 ImmOffset -= Overflow;
9986 if ((int32_t)Overflow < 0) {
9987 Overflow += ImmOffset;
9988 ImmOffset = 0;
9989 }
9990 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
9991 if (Overflow) {
9992 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
9993 if (!N0)
9994 N0 = OverflowVal;
9995 else {
9996 SDValue Ops[] = { N0, OverflowVal };
9997 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
9998 }
9999 }
10000 }
10001 if (!N0)
10002 N0 = DAG.getConstant(0, DL, MVT::i32);
10003 if (!C1)
10004 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
10005 return {N0, SDValue(C1, 0)};
10006}
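// A rough worked example of the split above (assuming a MaxImm of 4095, the
// typical MUBUF immediate limit): a combined offset of 8196 gives
// Overflow = 8196 & ~4095 = 8192 and ImmOffset = 4, so the instruction keeps
// immoffset 4 while 8192 is added into voffset, where the round power of 2 is
// likely to be CSEd with the add used by a neighbouring access. If the part
// destined for voffset would be negative, the whole offset is placed in
// voffset instead and immoffset becomes 0.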
10007
10008// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
10009// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
10010// pointed to by Offsets.
10011void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
10012 SelectionDAG &DAG, SDValue *Offsets,
10013 Align Alignment) const {
10014 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
10015 SDLoc DL(CombinedOffset);
10016 if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
10017 uint32_t Imm = C->getZExtValue();
10018 uint32_t SOffset, ImmOffset;
10019 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
10020 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
10021 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
10022 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
10023 return;
10024 }
10025 }
10026 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
10027 SDValue N0 = CombinedOffset.getOperand(0);
10028 SDValue N1 = CombinedOffset.getOperand(1);
10029 uint32_t SOffset, ImmOffset;
10030 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
10031 if (Offset >= 0 &&
10032 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
10033 Offsets[0] = N0;
10034 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
10035 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
10036 return;
10037 }
10038 }
10039
10040 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
10041 ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
10042 : DAG.getConstant(0, DL, MVT::i32);
10043
10044 Offsets[0] = CombinedOffset;
10045 Offsets[1] = SOffsetZero;
10046 Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
10047}
10048
10049SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
10050 SelectionDAG &DAG) const {
10051 if (!MaybePointer.getValueType().isScalarInteger())
10052 return MaybePointer;
10053
10054 SDLoc DL(MaybePointer);
10055
10056 SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
10057 return Rsrc;
10058}
10059
10060// Wrap a global or flat pointer into a buffer intrinsic using the flags
10061// specified in the intrinsic.
10062SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
10063 SelectionDAG &DAG) const {
10064 SDLoc Loc(Op);
10065
10066 SDValue Pointer = Op->getOperand(1);
10067 SDValue Stride = Op->getOperand(2);
10068 SDValue NumRecords = Op->getOperand(3);
10069 SDValue Flags = Op->getOperand(4);
10070
10071 auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
10072 SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
10073 SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
10074 std::optional<uint32_t> ConstStride = std::nullopt;
10075 if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
10076 ConstStride = ConstNode->getZExtValue();
10077
10078 SDValue NewHighHalf = Masked;
10079 if (!ConstStride || *ConstStride != 0) {
10080 SDValue ShiftedStride;
10081 if (ConstStride) {
10082 ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
10083 } else {
10084 SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
10085 ShiftedStride =
10086 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
10087 DAG.getShiftAmountConstant(16, MVT::i32, Loc));
10088 }
10089 NewHighHalf = DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
10090 }
10091
10092 SDValue Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf,
10093 NewHighHalf, NumRecords, Flags);
10094 SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
10095 return RsrcPtr;
10096}
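// Sketch of the 128-bit descriptor words assembled above (summarized from the
// code; see the ISA documentation for the authoritative V# encoding):
//   word0 = pointer[31:0]
//   word1 = pointer[47:32] in the low 16 bits, stride in the high 16 bits
//   word2 = NumRecords
//   word3 = Flags, passed through unchanged from the intrinsic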
10097
10098// Handle 8 bit and 16 bit buffer loads
10099SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
10100 EVT LoadVT, SDLoc DL,
10101 ArrayRef<SDValue> Ops,
10102 MachineMemOperand *MMO,
10103 bool IsTFE) const {
10104 EVT IntVT = LoadVT.changeTypeToInteger();
10105
10106 if (IsTFE) {
10107 unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
10108 ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
10109 : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
10110 MachineFunction &MF = DAG.getMachineFunction();
10111 MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, 0, 8);
10112 SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
10113 SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
10114 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
10115 DAG.getConstant(1, DL, MVT::i32));
10116 SDValue Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
10117 DAG.getConstant(0, DL, MVT::i32));
10118 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Data);
10119 SDValue Value = DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc);
10120 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
10121 }
10122
10123 unsigned Opc = (LoadVT.getScalarType() == MVT::i8) ?
10124 AMDGPUISD::BUFFER_LOAD_UBYTE : AMDGPUISD::BUFFER_LOAD_USHORT;
10125
10126 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
10127 SDValue BufferLoad =
10128 DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
10129 SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
10130 LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
10131
10132 return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
10133}
10134
10135// Handle 8 bit and 16 bit buffer stores
10136SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
10137 EVT VDataType, SDLoc DL,
10138 SDValue Ops[],
10139 MemSDNode *M) const {
10140 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
10141 Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
10142
10143 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
10144 Ops[1] = BufferStoreExt;
10145 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE :
10146 AMDGPUISD::BUFFER_STORE_SHORT;
10147 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
10148 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
10149 M->getMemOperand());
10150}
10151
10152static SDValue getLoadExtOrTrunc(SelectionDAG &DAG,
10153 ISD::LoadExtType ExtType, SDValue Op,
10154 const SDLoc &SL, EVT VT) {
10155 if (VT.bitsLT(Op.getValueType()))
10156 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
10157
10158 switch (ExtType) {
10159 case ISD::SEXTLOAD:
10160 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
10161 case ISD::ZEXTLOAD:
10162 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
10163 case ISD::EXTLOAD:
10164 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
10165 case ISD::NON_EXTLOAD:
10166 return Op;
10167 }
10168
10169 llvm_unreachable("invalid ext type");
10170}
10171
10172// Try to turn 8- and 16-bit scalar loads into SMEM-eligible 32-bit loads.
10173// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
10174SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const {
10175 SelectionDAG &DAG = DCI.DAG;
10176 if (Ld->getAlign() < Align(4) || Ld->isDivergent())
10177 return SDValue();
10178
10179 // FIXME: Constant loads should all be marked invariant.
10180 unsigned AS = Ld->getAddressSpace();
10181 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
10182 AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
10183 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
10184 return SDValue();
10185
10186 // Don't do this early, since it may interfere with adjacent load merging for
10187 // illegal types. We can avoid losing alignment information for exotic types
10188 // pre-legalize.
10189 EVT MemVT = Ld->getMemoryVT();
10190 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
10191 MemVT.getSizeInBits() >= 32)
10192 return SDValue();
10193
10194 SDLoc SL(Ld);
10195
10196 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
10197 "unexpected vector extload");
10198
10199 // TODO: Drop only high part of range.
10200 SDValue Ptr = Ld->getBasePtr();
10201 SDValue NewLoad = DAG.getLoad(
10202 ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
10203 Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
10204 Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
10205 nullptr); // Drop ranges
10206
10207 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
10208 if (MemVT.isFloatingPoint()) {
10209 assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
10210 "unexpected fp extload");
10211 TruncVT = MemVT.changeTypeToInteger();
10212 }
10213
10214 SDValue Cvt = NewLoad;
10215 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
10216 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
10217 DAG.getValueType(TruncVT));
10218 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
10219 Ld->getExtensionType() == ISD::NON_EXTLOAD) {
10220 Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
10221 } else {
10222 assert(Ld->getExtensionType() == ISD::EXTLOAD);
10223 }
10224
10225 EVT VT = Ld->getValueType(0);
10226 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
10227
10228 DCI.AddToWorklist(Cvt.getNode());
10229
10230 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
10231 // the appropriate extension from the 32-bit load.
10232 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
10233 DCI.AddToWorklist(Cvt.getNode());
10234
10235 // Handle conversion back to floating point if necessary.
10236 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
10237
10238 return DAG.getMergeValues({ Cvt, NewLoad.getValue(1) }, SL);
10239}
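// Illustrative effect (a sketch, not taken from an existing test): a uniform,
// 4-byte-aligned constant-address load such as
//   %v = load i16, ptr addrspace(4) %p, align 4
// is rewritten here as an i32 load that can select to s_load_dword, and the
// original i16 value is recovered with the truncate / sign- or zero-extend
// sequence produced by getLoadExtOrTrunc above.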
10240
10241static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO,
10242 const SIMachineFunctionInfo &Info) {
10243 // TODO: Should check if the address can definitely not access stack.
10244 if (Info.isEntryFunction())
10245 return Info.getUserSGPRInfo().hasFlatScratchInit();
10246 return true;
10247}
10248
10249SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
10250 SDLoc DL(Op);
10251 LoadSDNode *Load = cast<LoadSDNode>(Op);
10252 ISD::LoadExtType ExtType = Load->getExtensionType();
10253 EVT MemVT = Load->getMemoryVT();
10254
10255 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
10256 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
10257 return SDValue();
10258
10259 // FIXME: Copied from PPC
10260 // First, load into 32 bits, then truncate to 1 bit.
10261
10262 SDValue Chain = Load->getChain();
10263 SDValue BasePtr = Load->getBasePtr();
10264 MachineMemOperand *MMO = Load->getMemOperand();
10265
10266 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
10267
10268 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
10269 BasePtr, RealMemVT, MMO);
10270
10271 if (!MemVT.isVector()) {
10272 SDValue Ops[] = {
10273 DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
10274 NewLD.getValue(1)
10275 };
10276
10277 return DAG.getMergeValues(Ops, DL);
10278 }
10279
10280 SmallVector<SDValue, 3> Elts;
10281 for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
10282 SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
10283 DAG.getConstant(I, DL, MVT::i32));
10284
10285 Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
10286 }
10287
10288 SDValue Ops[] = {
10289 DAG.getBuildVector(MemVT, DL, Elts),
10290 NewLD.getValue(1)
10291 };
10292
10293 return DAG.getMergeValues(Ops, DL);
10294 }
10295
10296 if (!MemVT.isVector())
10297 return SDValue();
10298
10299 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
10300 "Custom lowering for non-i32 vectors hasn't been implemented.");
10301
10302 Align Alignment = Load->getAlign();
10303 unsigned AS = Load->getAddressSpace();
10304 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
10305 Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
10306 return SplitVectorLoad(Op, DAG);
10307 }
10308
10309 MachineFunction &MF = DAG.getMachineFunction();
10310 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
10311 // If there is a possibility that a flat instruction accesses scratch memory
10312 // then we need to use the same legalization rules we use for private.
10313 if (AS == AMDGPUAS::FLAT_ADDRESS &&
10314 !Subtarget->hasMultiDwordFlatScratchAddressing())
10315 AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI) ?
10316 AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
10317
10318 unsigned NumElements = MemVT.getVectorNumElements();
10319
10320 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10321 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
10322 if (!Op->isDivergent() && Alignment >= Align(4) && NumElements < 32) {
10323 if (MemVT.isPow2VectorType() ||
10324 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
10325 return SDValue();
10326 return WidenOrSplitVectorLoad(Op, DAG);
10327 }
10328 // Non-uniform loads will be selected to MUBUF instructions, so they
10329 // have the same legalization requirements as global and private
10330 // loads.
10331 //
10332 }
10333
10334 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10335 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
10336 AS == AMDGPUAS::GLOBAL_ADDRESS) {
10337 if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
10338 Load->isSimple() && isMemOpHasNoClobberedMemOperand(Load) &&
10339 Alignment >= Align(4) && NumElements < 32) {
10340 if (MemVT.isPow2VectorType() ||
10341 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
10342 return SDValue();
10343 return WidenOrSplitVectorLoad(Op, DAG);
10344 }
10345 // Non-uniform loads will be selected to MUBUF instructions, so they
10346 // have the same legalization requirements as global and private
10347 // loads.
10348 //
10349 }
10350 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10351 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
10352 AS == AMDGPUAS::GLOBAL_ADDRESS ||
10353 AS == AMDGPUAS::FLAT_ADDRESS) {
10354 if (NumElements > 4)
10355 return SplitVectorLoad(Op, DAG);
10356 // v3 loads not supported on SI.
10357 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10358 return WidenOrSplitVectorLoad(Op, DAG);
10359
10360 // v3 and v4 loads are supported for private and global memory.
10361 return SDValue();
10362 }
10363 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
10364 // Depending on the setting of the private_element_size field in the
10365 // resource descriptor, we can only make private accesses up to a certain
10366 // size.
10367 switch (Subtarget->getMaxPrivateElementSize()) {
10368 case 4: {
10369 SDValue Ops[2];
10370 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
10371 return DAG.getMergeValues(Ops, DL);
10372 }
10373 case 8:
10374 if (NumElements > 2)
10375 return SplitVectorLoad(Op, DAG);
10376 return SDValue();
10377 case 16:
10378 // Same as global/flat
10379 if (NumElements > 4)
10380 return SplitVectorLoad(Op, DAG);
10381 // v3 loads not supported on SI.
10382 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10383 return WidenOrSplitVectorLoad(Op, DAG);
10384
10385 return SDValue();
10386 default:
10387 llvm_unreachable("unsupported private_element_size");
10388 }
10389 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
10390 unsigned Fast = 0;
10391 auto Flags = Load->getMemOperand()->getFlags();
10392 if (allowsMisalignedMemoryAccesses(MemVT, AS,
10393 Load->getAlign(), Flags, &Fast) &&
10394 Fast > 1)
10395 return SDValue();
10396
10397 if (MemVT.isVector())
10398 return SplitVectorLoad(Op, DAG);
10399 }
10400
10401 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
10402 MemVT, *Load->getMemOperand())) {
10403 SDValue Ops[2];
10404 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
10405 return DAG.getMergeValues(Ops, DL);
10406 }
10407
10408 return SDValue();
10409}
10410
10411SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
10412 EVT VT = Op.getValueType();
10413 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
10414 VT.getSizeInBits() == 512)
10415 return splitTernaryVectorOp(Op, DAG);
10416
10417 assert(VT.getSizeInBits() == 64);
10418
10419 SDLoc DL(Op);
10420 SDValue Cond = Op.getOperand(0);
10421
10422 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
10423 SDValue One = DAG.getConstant(1, DL, MVT::i32);
10424
10425 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
10426 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
10427
10428 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
10429 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
10430
10431 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
10432
10433 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
10434 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
10435
10436 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
10437
10438 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
10439 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
10440}
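// For reference, a sketch of the split performed above (illustrative only):
//   select i1 %c, i64 %a, i64 %b
// is lowered as two 32-bit selects on the bitcast halves,
//   %lo = select i1 %c, i32 %a.lo, i32 %b.lo
//   %hi = select i1 %c, i32 %a.hi, i32 %b.hi
// which typically map onto a pair of v_cndmask_b32 instructions when the
// condition is divergent.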
10441
10442// Catch division cases where we can use shortcuts with rcp and rsq
10443// instructions.
10444SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
10445 SelectionDAG &DAG) const {
10446 SDLoc SL(Op);
10447 SDValue LHS = Op.getOperand(0);
10448 SDValue RHS = Op.getOperand(1);
10449 EVT VT = Op.getValueType();
10450 const SDNodeFlags Flags = Op->getFlags();
10451
10452 bool AllowInaccurateRcp = Flags.hasApproximateFuncs() ||
10453 DAG.getTarget().Options.UnsafeFPMath;
10454
10455 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
10456 // Without !fpmath accuracy information, we can't do more because we don't
10457 // know exactly whether rcp is accurate enough to meet !fpmath requirement.
10458 // f16 is always accurate enough
10459 if (!AllowInaccurateRcp && VT != MVT::f16)
10460 return SDValue();
10461
10462 if (CLHS->isExactlyValue(1.0)) {
10463 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
10464 // the CI documentation have a worst case error of 1 ulp.
10465 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
10466 // use it as long as we aren't trying to use denormals.
10467 //
10468 // v_rcp_f16 and v_rsq_f16 DO support denormals and have a 0.51 ulp error.
10469
10470 // 1.0 / sqrt(x) -> rsq(x)
10471
10472 // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
10473 // error seems really high at 2^29 ULP.
10474 // 1.0 / x -> rcp(x)
10475 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
10476 }
10477
10478 // Same as for 1.0, but expand the sign out of the constant.
10479 if (CLHS->isExactlyValue(-1.0)) {
10480 // -1.0 / x -> rcp (fneg x)
10481 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
10482 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
10483 }
10484 }
10485
10486 // For f16 require afn or arcp.
10487 // For f32 require afn.
10488 if (!AllowInaccurateRcp && (VT != MVT::f16 || !Flags.hasAllowReciprocal()))
10489 return SDValue();
10490
10491 // Turn into multiply by the reciprocal.
10492 // x / y -> x * (1.0 / y)
10493 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
10494 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
10495}
10496
10497SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
10498 SelectionDAG &DAG) const {
10499 SDLoc SL(Op);
10500 SDValue X = Op.getOperand(0);
10501 SDValue Y = Op.getOperand(1);
10502 EVT VT = Op.getValueType();
10503 const SDNodeFlags Flags = Op->getFlags();
10504
10505 bool AllowInaccurateDiv = Flags.hasApproximateFuncs() ||
10506 DAG.getTarget().Options.UnsafeFPMath;
10507 if (!AllowInaccurateDiv)
10508 return SDValue();
10509
10510 SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
10511 SDValue One = DAG.getConstantFP(1.0, SL, VT);
10512
10513 SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
10514 SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
10515
10516 R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
10517 SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
10518 R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
10519 SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
10520 SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
10521 return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
10522}
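// The sequence above is two FMA-based Newton-Raphson refinements of the
// initial reciprocal estimate, followed by one residual correction of the
// quotient. In exact arithmetic the steps are (rough sketch):
//   r1 = r0 + r0*(1 - y*r0)
//   r2 = r1 + r1*(1 - y*r1)
//   q  = x*r2
//   q' = q + r2*(x - y*q)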
10523
10524static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
10525 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
10526 SDNodeFlags Flags) {
10527 if (GlueChain->getNumValues() <= 1) {
10528 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
10529 }
10530
10531 assert(GlueChain->getNumValues() == 3);
10532
10533 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
10534 switch (Opcode) {
10535 default: llvm_unreachable("no chain equivalent for opcode");
10536 case ISD::FMUL:
10537 Opcode = AMDGPUISD::FMUL_W_CHAIN;
10538 break;
10539 }
10540
10541 return DAG.getNode(Opcode, SL, VTList,
10542 {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
10543 Flags);
10544}
10545
10546static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
10547 EVT VT, SDValue A, SDValue B, SDValue C,
10548 SDValue GlueChain, SDNodeFlags Flags) {
10549 if (GlueChain->getNumValues() <= 1) {
10550 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
10551 }
10552
10553 assert(GlueChain->getNumValues() == 3);
10554
10555 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
10556 switch (Opcode) {
10557 default: llvm_unreachable("no chain equivalent for opcode");
10558 case ISD::FMA:
10559 Opcode = AMDGPUISD::FMA_W_CHAIN;
10560 break;
10561 }
10562
10563 return DAG.getNode(Opcode, SL, VTList,
10564 {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
10565 Flags);
10566}
10567
10568SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
10569 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
10570 return FastLowered;
10571
10572 SDLoc SL(Op);
10573 SDValue Src0 = Op.getOperand(0);
10574 SDValue Src1 = Op.getOperand(1);
10575
10576 SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
10577 SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
10578
10579 SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1);
10580 SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1);
10581
10582 SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32);
10583 SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);
10584
10585 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0);
10586}
10587
10588// Faster 2.5 ULP division that does not support denormals.
10589SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
10590 SDNodeFlags Flags = Op->getFlags();
10591 SDLoc SL(Op);
10592 SDValue LHS = Op.getOperand(1);
10593 SDValue RHS = Op.getOperand(2);
10594
10595 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
10596
10597 const APFloat K0Val(0x1p+96f);
10598 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
10599
10600 const APFloat K1Val(0x1p-32f);
10601 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
10602
10603 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
10604
10605 EVT SetCCVT =
10606 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
10607
10608 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
10609
10610 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);
10611
10612 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);
10613
10614 // rcp does not support denormals.
10615 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);
10616
10617 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);
10618
10619 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
10620}
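// Rough intuition for the scaling above (illustrative numbers, not from the
// source): the reciprocal of a very large denominator can fall into the
// denormal range, which rcp flushes, so when |rhs| > 2^96 the denominator is
// pre-multiplied by 2^-32 and the final result is multiplied by the same
// factor. E.g. with lhs = 1.0 and rhs = 2^100: r1 = 2^100 * 2^-32 = 2^68,
// rcp(r1) = 2^-68, and the final multiply by r3 = 2^-32 gives 2^-100.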
10621
10622// Returns immediate value for setting the F32 denorm mode when using the
10623// S_DENORM_MODE instruction.
10624static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG,
10625 const SIMachineFunctionInfo *Info,
10626 const GCNSubtarget *ST) {
10627 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
10628 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
10629 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
10630 return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
10631}
10632
10633SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
10634 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
10635 return FastLowered;
10636
10637 // The selection matcher assumes anything with a chain selects to a
10638 // mayRaiseFPException machine instruction. Since we're introducing a chain
10639 // here, we need to explicitly report nofpexcept for the regular fdiv
10640 // lowering.
10641 SDNodeFlags Flags = Op->getFlags();
10642 Flags.setNoFPExcept(true);
10643
10644 SDLoc SL(Op);
10645 SDValue LHS = Op.getOperand(0);
10646 SDValue RHS = Op.getOperand(1);
10647
10648 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
10649
10650 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
10651
10652 SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
10653 {RHS, RHS, LHS}, Flags);
10654 SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
10655 {LHS, RHS, LHS}, Flags);
10656
10657 // Denominator is scaled to not be denormal, so using rcp is ok.
10658 SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32,
10659 DenominatorScaled, Flags);
10660 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32,
10661 DenominatorScaled, Flags);
10662
10663 using namespace AMDGPU::Hwreg;
10664 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
10665 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
10666
10667 const MachineFunction &MF = DAG.getMachineFunction();
10668 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
10669 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
10670
10671 const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
10672 const bool HasDynamicDenormals =
10673 (DenormMode.Input == DenormalMode::Dynamic) ||
10674 (DenormMode.Output == DenormalMode::Dynamic);
10675
10676 SDValue SavedDenormMode;
10677
10678 if (!PreservesDenormals) {
10679 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
10680 // lowering. The chain dependence is insufficient, and we need glue. We do
10681 // not need the glue variants in a strictfp function.
10682
10683 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
10684
10685 SDValue Glue = DAG.getEntryNode();
10686 if (HasDynamicDenormals) {
10687 SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
10688 DAG.getVTList(MVT::i32, MVT::Glue),
10689 {BitField, Glue});
10690 SavedDenormMode = SDValue(GetReg, 0);
10691
10692 Glue = DAG.getMergeValues(
10693 {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL);
10694 }
10695
10696 SDNode *EnableDenorm;
10697 if (Subtarget->hasDenormModeInst()) {
10698 const SDValue EnableDenormValue =
10699 getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget);
10700
10701 EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
10702 EnableDenormValue)
10703 .getNode();
10704 } else {
10705 const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
10706 SL, MVT::i32);
10707 EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
10708 {EnableDenormValue, BitField, Glue});
10709 }
10710
10711 SDValue Ops[3] = {
10712 NegDivScale0,
10713 SDValue(EnableDenorm, 0),
10714 SDValue(EnableDenorm, 1)
10715 };
10716
10717 NegDivScale0 = DAG.getMergeValues(Ops, SL);
10718 }
10719
10720 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
10721 ApproxRcp, One, NegDivScale0, Flags);
10722
10723 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
10724 ApproxRcp, Fma0, Flags);
10725
10726 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled,
10727 Fma1, Fma1, Flags);
10728
10729 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
10730 NumeratorScaled, Mul, Flags);
10731
10732 SDValue Fma3 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32,
10733 Fma2, Fma1, Mul, Fma2, Flags);
10734
10735 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
10736 NumeratorScaled, Fma3, Flags);
10737
10738 if (!PreservesDenormals) {
10739 SDNode *DisableDenorm;
10740 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
10741 const SDValue DisableDenormValue = getSPDenormModeValue(
10742 FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);
10743
10744 DisableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, MVT::Other,
10745 Fma4.getValue(1), DisableDenormValue,
10746 Fma4.getValue(2)).getNode();
10747 } else {
10748 assert(HasDynamicDenormals == (bool)SavedDenormMode);
10749 const SDValue DisableDenormValue =
10750 HasDynamicDenormals
10751 ? SavedDenormMode
10752 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
10753
10754 DisableDenorm = DAG.getMachineNode(
10755 AMDGPU::S_SETREG_B32, SL, MVT::Other,
10756 {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
10757 }
10758
10759 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
10760 SDValue(DisableDenorm, 0), DAG.getRoot());
10761 DAG.setRoot(OutputChain);
10762 }
10763
10764 SDValue Scale = NumeratorScaled.getValue(1);
10765 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
10766 {Fma4, Fma1, Fma3, Scale}, Flags);
10767
10768 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
10769}
10770
10771SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
10772 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
10773 return FastLowered;
10774
10775 SDLoc SL(Op);
10776 SDValue X = Op.getOperand(0);
10777 SDValue Y = Op.getOperand(1);
10778
10779 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
10780
10781 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
10782
10783 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
10784
10785 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
10786
10787 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
10788
10789 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
10790
10791 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
10792
10793 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
10794
10795 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
10796
10797 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
10798 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
10799
10800 SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
10801 NegDivScale0, Mul, DivScale1);
10802
10803 SDValue Scale;
10804
10805 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
10806 // Workaround a hardware bug on SI where the condition output from div_scale
10807 // is not usable.
10808
10809 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
10810
10811 // Figure out which scale to use for div_fmas.
10812 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
10813 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
10814 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
10815 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
10816
10817 SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
10818 SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
10819
10820 SDValue Scale0Hi
10821 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
10822 SDValue Scale1Hi
10823 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
10824
10825 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
10826 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
10827 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
10828 } else {
10829 Scale = DivScale1.getValue(1);
10830 }
10831
10832 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
10833 Fma4, Fma3, Mul, Scale);
10834
10835 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
10836}
10837
10838SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
10839 EVT VT = Op.getValueType();
10840
10841 if (VT == MVT::f32)
10842 return LowerFDIV32(Op, DAG);
10843
10844 if (VT == MVT::f64)
10845 return LowerFDIV64(Op, DAG);
10846
10847 if (VT == MVT::f16)
10848 return LowerFDIV16(Op, DAG);
10849
10850 llvm_unreachable("Unexpected type for fdiv");
10851}
10852
10853SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
10854 SDLoc dl(Op);
10855 SDValue Val = Op.getOperand(0);
10856 EVT VT = Val.getValueType();
10857 EVT ResultExpVT = Op->getValueType(1);
10858 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
10859
10860 SDValue Mant = DAG.getNode(
10861 ISD::INTRINSIC_WO_CHAIN, dl, VT,
10862 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);
10863
10864 SDValue Exp = DAG.getNode(
10865 ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
10866 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);
10867
10868 if (Subtarget->hasFractBug()) {
10869 SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val);
10870 SDValue Inf = DAG.getConstantFP(
10871 APFloat::getInf(SelectionDAG::EVTToAPFloatSemantics(VT)), dl, VT);
10872
10873 SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
10874 SDValue Zero = DAG.getConstant(0, dl, InstrExpVT);
10875 Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero);
10876 Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val);
10877 }
10878
10879 SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT);
10880 return DAG.getMergeValues({Mant, CastExp}, dl);
10881}
10882
10883SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
10884 SDLoc DL(Op);
10885 StoreSDNode *Store = cast<StoreSDNode>(Op);
10886 EVT VT = Store->getMemoryVT();
10887
10888 if (VT == MVT::i1) {
10889 return DAG.getTruncStore(Store->getChain(), DL,
10890 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
10891 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
10892 }
10893
10894 assert(VT.isVector() &&
10895 Store->getValue().getValueType().getScalarType() == MVT::i32);
10896
10897 unsigned AS = Store->getAddressSpace();
10898 if (Subtarget->hasLDSMisalignedBug() &&
10899 AS == AMDGPUAS::FLAT_ADDRESS &&
10900 Store->getAlign().value() < VT.getStoreSize() && VT.getSizeInBits() > 32) {
10901 return SplitVectorStore(Op, DAG);
10902 }
10903
10904 MachineFunction &MF = DAG.getMachineFunction();
10905 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
10906 // If there is a possibility that a flat instruction accesses scratch memory
10907 // then we need to use the same legalization rules we use for private.
10908 if (AS == AMDGPUAS::FLAT_ADDRESS &&
10909 !Subtarget->hasMultiDwordFlatScratchAddressing())
10910 AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI) ?
10911 AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
10912
10913 unsigned NumElements = VT.getVectorNumElements();
10914 if (AS == AMDGPUAS::GLOBAL_ADDRESS ||
10915 AS == AMDGPUAS::FLAT_ADDRESS) {
10916 if (NumElements > 4)
10917 return SplitVectorStore(Op, DAG);
10918 // v3 stores not supported on SI.
10919 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10920 return SplitVectorStore(Op, DAG);
10921
10922 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
10923 VT, *Store->getMemOperand()))
10924 return expandUnalignedStore(Store, DAG);
10925
10926 return SDValue();
10927 }
10928 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
10929 switch (Subtarget->getMaxPrivateElementSize()) {
10930 case 4:
10931 return scalarizeVectorStore(Store, DAG);
10932 case 8:
10933 if (NumElements > 2)
10934 return SplitVectorStore(Op, DAG);
10935 return SDValue();
10936 case 16:
10937 if (NumElements > 4 ||
10938 (NumElements == 3 && !Subtarget->enableFlatScratch()))
10939 return SplitVectorStore(Op, DAG);
10940 return SDValue();
10941 default:
10942 llvm_unreachable("unsupported private_element_size");
10943 }
10944 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
10945 unsigned Fast = 0;
10946 auto Flags = Store->getMemOperand()->getFlags();
10947 if (allowsMisalignedMemoryAccesses(VT, AS,
10948 Store->getAlign(), Flags, &Fast) &&
10949 Fast > 1)
10950 return SDValue();
10951
10952 if (VT.isVector())
10953 return SplitVectorStore(Op, DAG);
10954
10955 return expandUnalignedStore(Store, DAG);
10956 }
10957
10958 // Probably an invalid store. If so we'll end up emitting a selection error.
10959 return SDValue();
10960}
10961
10962// Avoid the full correct expansion for f32 sqrt when promoting from f16.
10963SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
10964 SDLoc SL(Op);
10965 assert(!Subtarget->has16BitInsts());
10966 SDNodeFlags Flags = Op->getFlags();
10967 SDValue Ext =
10968 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
10969
10970 SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
10971 SDValue Sqrt =
10972 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);
10973
10974 return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
10975 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
10976}
10977
10978SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
10979 SDLoc DL(Op);
10980 SDNodeFlags Flags = Op->getFlags();
10981 MVT VT = Op.getValueType().getSimpleVT();
10982 const SDValue X = Op.getOperand(0);
10983
10984 if (allowApproxFunc(DAG, Flags)) {
10985 // Instruction is 1ulp but ignores denormals.
10986 return DAG.getNode(
10987 ISD::INTRINSIC_WO_CHAIN, DL, VT,
10988 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
10989 }
10990
10991 SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
10992 SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);
10993
10994 SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT);
10995
10996 SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags);
10997
10998 SDValue SqrtX =
10999 DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags);
11000
11001 SDValue SqrtS;
11002 if (needsDenormHandlingF32(DAG, X, Flags)) {
11003 SDValue SqrtID =
11004 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
11005 SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags);
11006
11007 SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
11008 SDValue SqrtSNextDownInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
11009 DAG.getConstant(-1, DL, MVT::i32));
11010 SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
11011
11012 SDValue NegSqrtSNextDown =
11013 DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
11014
11015 SDValue SqrtVP =
11016 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
11017
11018 SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
11019 DAG.getConstant(1, DL, MVT::i32));
11020 SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt);
11021
11022 SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
11023 SDValue SqrtVS =
11024 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
11025
11026 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
11027 SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);
11028
11029 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS,
11030 Flags);
11031
11032 SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
11033 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS,
11034 Flags);
11035 } else {
11036 SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags);
11037
11038 SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags);
11039
11040 SDValue Half = DAG.getConstantFP(0.5f, DL, VT);
11041 SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags);
11042 SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags);
11043
11044 SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags);
11045 SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags);
11046 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags);
11047
11048 SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags);
11049 SDValue SqrtD =
11050 DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags);
11051 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags);
11052 }
11053
11054 SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);
11055
11056 SDValue ScaledDown =
11057 DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags);
11058
11059 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags);
11060 SDValue IsZeroOrInf =
11061 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
11062 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
11063
11064 return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags);
11065}
11066
11067SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
11068 // For double type, the SQRT and RSQ instructions don't have the required
11069 // precision, so we apply Goldschmidt's algorithm to improve the result:
11070 //
11071 // y0 = rsq(x)
11072 // g0 = x * y0
11073 // h0 = 0.5 * y0
11074 //
11075 // r0 = 0.5 - h0 * g0
11076 // g1 = g0 * r0 + g0
11077 // h1 = h0 * r0 + h0
11078 //
11079 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
11080 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
11081 // h2 = h1 * r1 + h1
11082 //
11083 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
11084 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
11085 //
11086 // sqrt(x) = g3
11087
11088 SDNodeFlags Flags = Op->getFlags();
11089
11090 SDLoc DL(Op);
11091
11092 SDValue X = Op.getOperand(0);
11093 SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
11094
11095 SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);
11096
11097 SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);
11098
11099 // Scale up input if it is too small.
11100 SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
11101 SDValue ScaleUp =
11102 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
11103 SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);
11104
11105 SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);
11106
11107 SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);
11108
11109 SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
11110 SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);
11111
11112 SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
11113 SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);
11114
11115 SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);
11116
11117 SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);
11118
11119 SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
11120 SDValue SqrtD0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);
11121
11122 SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);
11123
11124 SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
11125 SDValue SqrtD1 =
11126 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);
11127
11128 SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);
11129
11130 SDValue ScaleDownFactor = DAG.getConstant(-128, DL, MVT::i32);
11131 SDValue ScaleDown =
11132 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt);
11133 SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
11134
11135 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
11136 // with finite only or nsz because rsq(+/-0) = +/-inf
11137
11138 // TODO: Check for DAZ and expand to subnormals
11139 SDValue IsZeroOrInf =
11140 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
11141 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
11142
11143 // If x is +INF, +0, or -0, use its original value
11144 return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
11145 Flags);
11146}
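// Note on the pre/post scaling above (an informal reading of the code, not
// from the source comments): inputs smaller than 2^-767 are scaled up by
// 2^256 before the iteration so the intermediate products stay well away from
// the denormal range; since sqrt(x * 2^256) = sqrt(x) * 2^128, the final
// ldexp by -128 undoes the scaling.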
11147
11148SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
11149 SDLoc DL(Op);
11150 EVT VT = Op.getValueType();
11151 SDValue Arg = Op.getOperand(0);
11152 SDValue TrigVal;
11153
11154 // Propagate fast-math flags so that the multiply we introduce can be folded
11155 // if Arg is already the result of a multiply by constant.
11156 auto Flags = Op->getFlags();
11157
11158 SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
11159
11160 if (Subtarget->hasTrigReducedRange()) {
11161 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
11162 TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags);
11163 } else {
11164 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
11165 }
11166
11167 switch (Op.getOpcode()) {
11168 case ISD::FCOS:
11169 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
11170 case ISD::FSIN:
11171 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
11172 default:
11173 llvm_unreachable("Wrong trig opcode");
11174 }
11175}
11176
11177SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
11178 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
11179 assert(AtomicNode->isCompareAndSwap());
11180 unsigned AS = AtomicNode->getAddressSpace();
11181
11182 // No custom lowering required for local address space
11183 if (!AMDGPU::isFlatGlobalAddrSpace(AS))
11184 return Op;
11185
11186 // Non-local address space requires custom lowering for atomic compare
11187 // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
11188 SDLoc DL(Op);
11189 SDValue ChainIn = Op.getOperand(0);
11190 SDValue Addr = Op.getOperand(1);
11191 SDValue Old = Op.getOperand(2);
11192 SDValue New = Op.getOperand(3);
11193 EVT VT = Op.getValueType();
11194 MVT SimpleVT = VT.getSimpleVT();
11195 MVT VecType = MVT::getVectorVT(SimpleVT, 2);
11196
11197 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
11198 SDValue Ops[] = { ChainIn, Addr, NewOld };
11199
11200 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, Op->getVTList(),
11201 Ops, VT, AtomicNode->getMemOperand());
11202}
11203
11204//===----------------------------------------------------------------------===//
11205// Custom DAG optimizations
11206//===----------------------------------------------------------------------===//
11207
11208SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
11209 DAGCombinerInfo &DCI) const {
11210 EVT VT = N->getValueType(0);
11211 EVT ScalarVT = VT.getScalarType();
11212 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
11213 return SDValue();
11214
11215 SelectionDAG &DAG = DCI.DAG;
11216 SDLoc DL(N);
11217
11218 SDValue Src = N->getOperand(0);
11219 EVT SrcVT = Src.getValueType();
11220
11221 // TODO: We could try to match extracting the higher bytes, which would be
11222 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
11223 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
11224 // about in practice.
11225 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
11226 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
11227 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
11228 DCI.AddToWorklist(Cvt.getNode());
11229
11230 // For the f16 case, fold to a cast to f32 and then cast back to f16.
11231 if (ScalarVT != MVT::f32) {
11232 Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
11233 DAG.getTargetConstant(0, DL, MVT::i32));
11234 }
11235 return Cvt;
11236 }
11237 }
11238
11239 return SDValue();
11240}
11241
11242SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
11243 DAGCombinerInfo &DCI) const {
11244 SDValue MagnitudeOp = N->getOperand(0);
11245 SDValue SignOp = N->getOperand(1);
11246 SelectionDAG &DAG = DCI.DAG;
11247 SDLoc DL(N);
11248
11249 // f64 fcopysign is really an f32 copysign on the high bits, so replace the
11250 // lower half with a copy.
11251 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
11252 if (MagnitudeOp.getValueType() == MVT::f64) {
11253 SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, MagnitudeOp);
11254 SDValue MagLo =
11255 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
11256 DAG.getConstant(0, DL, MVT::i32));
11257 SDValue MagHi =
11258 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
11259 DAG.getConstant(1, DL, MVT::i32));
11260
11261 SDValue HiOp =
11262 DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOp);
11263
11264 SDValue Vector = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
11265
11266 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
11267 }
11268
11269 if (SignOp.getValueType() != MVT::f64)
11270 return SDValue();
11271
11272 // Reduce width of sign operand, we only need the highest bit.
11273 //
11274 // fcopysign f64:x, f64:y ->
11275 // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
11276 // TODO: In some cases it might make sense to go all the way to f16.
11277 SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, SignOp);
11278 SDValue SignAsF32 =
11279 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
11280 DAG.getConstant(1, DL, MVT::i32));
11281
11282 return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
11283 SignAsF32);
11284}
11285
11286// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
11287// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
11288// bits
11289
11290// This is a variant of
11291// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
11292//
11293// The normal DAG combiner will do this, but only if the add has one use,
11294// since otherwise it would increase the number of instructions.
11295//
11296// This prevents us from seeing a constant offset that can be folded into a
11297// memory instruction's addressing mode. If we know the resulting add offset of
11298// a pointer can be folded into an addressing offset, we can replace the pointer
11299// operand with the add of the new constant offset. This eliminates one of the uses,
11300// and may allow the remaining use to also be simplified.
11301//
11302SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
11303 unsigned AddrSpace,
11304 EVT MemVT,
11305 DAGCombinerInfo &DCI) const {
11306 SDValue N0 = N->getOperand(0);
11307 SDValue N1 = N->getOperand(1);
11308
11309 // We only do this to handle cases where it's profitable when there are
11310 // multiple uses of the add, so defer to the standard combine.
11311 if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
11312 N0->hasOneUse())
11313 return SDValue();
11314
11315 ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
11316 if (!CN1)
11317 return SDValue();
11318
11319 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
11320 if (!CAdd)
11321 return SDValue();
11322
11323 SelectionDAG &DAG = DCI.DAG;
11324
11325 if (N0->getOpcode() == ISD::OR &&
11326 !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
11327 return SDValue();
11328
11329 // If the resulting offset is too large, we can't fold it into the
11330 // addressing mode offset.
11331 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
11332 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
11333
11334 AddrMode AM;
11335 AM.HasBaseReg = true;
11336 AM.BaseOffs = Offset.getSExtValue();
11337 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
11338 return SDValue();
11339
11340 SDLoc SL(N);
11341 EVT VT = N->getValueType(0);
11342
11343 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
11344 SDValue COffset = DAG.getConstant(Offset, SL, VT);
11345
11346 SDNodeFlags Flags;
11347 Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap() &&
11348 (N0.getOpcode() == ISD::OR ||
11349 N0->getFlags().hasNoUnsignedWrap()));
11350
11351 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
11352}
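// Illustrative example of the transform (a sketch, not from a test case): if
// a DS access uses the pointer (shl (add %x, 4), 2), it is rewritten as
// (add (shl %x, 2), 16) once isLegalAddressingMode confirms that the new
// immediate 16 still fits the ds_read/ds_write offset field, allowing the
// constant to be folded into the instruction's offset operand.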
11353
11354/// MemSDNode::getBasePtr() does not work for intrinsics, whose pointer operand
11355/// index must be offset past the chain and the intrinsic ID. Theoretically we
11356/// would also need to check the specific intrinsic, but they all place the pointer operand first.
11357static unsigned getBasePtrIndex(const MemSDNode *N) {
11358 switch (N->getOpcode()) {
11359 case ISD::STORE:
11360 case ISD::ATOMIC_CMP_SWAP:
11361 case ISD::ATOMIC_STORE:
11362 return 2;
11363 default:
11364 return 1;
11365 }
11366}
11367
11368SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
11369 DAGCombinerInfo &DCI) const {
11370 SelectionDAG &DAG = DCI.DAG;
11371 SDLoc SL(N);
11372
11373 unsigned PtrIdx = getBasePtrIndex(N);
11374 SDValue Ptr = N->getOperand(PtrIdx);
11375
11376 // TODO: We could also do this for multiplies.
11377 if (Ptr.getOpcode() == ISD::SHL) {
11378 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
11379 N->getMemoryVT(), DCI);
11380 if (NewPtr) {
11381 SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());
11382
11383 NewOps[PtrIdx] = NewPtr;
11384 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
11385 }
11386 }
11387
11388 return SDValue();
11389}
11390
11391static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
11392 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
11393 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
11394 (Opc == ISD::XOR && Val == 0);
11395}
11396
11397// Break up a 64-bit bitwise operation with a constant into two 32-bit and/or/xor
11398// operations. This will typically happen anyway for a VALU 64-bit and. This
11399// exposes other 32-bit integer combine opportunities since most 64-bit
11400// operations are decomposed this way. TODO: We won't want this for SALU,
11401// especially if the constant is an inline immediate.
11402SDValue SITargetLowering::splitBinaryBitConstantOp(
11403 DAGCombinerInfo &DCI,
11404 const SDLoc &SL,
11405 unsigned Opc, SDValue LHS,
11406 const ConstantSDNode *CRHS) const {
11407 uint64_t Val = CRHS->getZExtValue();
11408 uint32_t ValLo = Lo_32(Val);
11409 uint32_t ValHi = Hi_32(Val);
11410 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11411
11412 if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
11413 bitOpWithConstantIsReducible(Opc, ValHi)) ||
11414 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
11415 // If we need to materialize a 64-bit immediate, it will be split up later
11416 // anyway. Avoid creating the harder to understand 64-bit immediate
11417 // materialization.
11418 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
11419 }
11420
11421 return SDValue();
11422}
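// Illustrative example (a sketch under the conditions above): for
//   and i64 %x, 0xffffffff00000000
// the low half reduces to the constant 0 and the high half is an ordinary
// 32-bit and with -1, i.e. just the high half of %x, so the node is rebuilt
// from two i32 pieces instead of materializing the 64-bit mask.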
11423
11424bool llvm::isBoolSGPR(SDValue V) {
11425 if (V.getValueType() != MVT::i1)
11426 return false;
11427 switch (V.getOpcode()) {
11428 default:
11429 break;
11430 case ISD::SETCC:
11431 case AMDGPUISD::FP_CLASS:
11432 return true;
11433 case ISD::AND:
11434 case ISD::OR:
11435 case ISD::XOR:
11436 return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
11437 }
11438 return false;
11439}
11440
11441// If a constant has all zeroes or all ones within each byte return it.
11442// Otherwise return 0.
11443static uint32_t getConstantPermuteMask(uint32_t C) {
11444 // 0xff for any zero byte in the mask
11445 uint32_t ZeroByteMask = 0;
11446 if (!(C & 0x000000ff)) ZeroByteMask |= 0x000000ff;
11447 if (!(C & 0x0000ff00)) ZeroByteMask |= 0x0000ff00;
11448 if (!(C & 0x00ff0000)) ZeroByteMask |= 0x00ff0000;
11449 if (!(C & 0xff000000)) ZeroByteMask |= 0xff000000;
11450 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
11451 if ((NonZeroByteMask & C) != NonZeroByteMask)
11452 return 0; // Partial bytes selected.
11453 return C;
11454}
11455
11456// Check if a node selects whole bytes from its operand 0 starting at a byte
11457// boundary while masking the rest. Returns the select mask as used by the
11458// v_perm_b32 instruction, or ~0 if the match did not succeed.
11459// Note byte select encoding:
11460// value 0-3 selects corresponding source byte;
11461// value 0xc selects zero;
11462// value 0xff selects 0xff.
11463static uint32_t getPermuteMask(SDValue V) {
11464 assert(V.getValueSizeInBits() == 32);
11465
11466 if (V.getNumOperands() != 2)
11467 return ~0;
11468
11469 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
11470 if (!N1)
11471 return ~0;
11472
11473 uint32_t C = N1->getZExtValue();
11474
11475 switch (V.getOpcode()) {
11476 default:
11477 break;
11478 case ISD::AND:
11479 if (uint32_t ConstMask = getConstantPermuteMask(C))
11480 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
11481 break;
11482
11483 case ISD::OR:
11484 if (uint32_t ConstMask = getConstantPermuteMask(C))
11485 return (0x03020100 & ~ConstMask) | ConstMask;
11486 break;
11487
11488 case ISD::SHL:
11489 if (C % 8)
11490 return ~0;
11491
11492 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
11493
11494 case ISD::SRL:
11495 if (C % 8)
11496 return ~0;
11497
11498 return uint32_t(0x0c0c0c0c03020100ull >> C);
11499 }
11500
11501 return ~0;
11502}
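// Worked example of the encoding above (illustrative): for (and x, 0x00ff0000)
// getConstantPermuteMask returns 0x00ff0000, so the computed select mask is
// 0x0c020c0c: byte 2 is taken from the source (selector 2) and the remaining
// bytes are forced to zero (selector 0xc), which is exactly what v_perm_b32
// does with that mask.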
11503
11504SDValue SITargetLowering::performAndCombine(SDNode *N,
11505 DAGCombinerInfo &DCI) const {
11506 if (DCI.isBeforeLegalize())
11507 return SDValue();
11508
11509 SelectionDAG &DAG = DCI.DAG;
11510 EVT VT = N->getValueType(0);
11511 SDValue LHS = N->getOperand(0);
11512 SDValue RHS = N->getOperand(1);
11513
11514
11515 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
11516 if (VT == MVT::i64 && CRHS) {
11517 if (SDValue Split
11518 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
11519 return Split;
11520 }
11521
11522 if (CRHS && VT == MVT::i32) {
11523 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
11524 // nb = number of trailing zeroes in mask
11525 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
11526 // given that we are selecting 8 or 16 bit fields starting at byte boundary.
11527 uint64_t Mask = CRHS->getZExtValue();
11528 unsigned Bits = llvm::popcount(Mask);
11529 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
11530 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
11531 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
11532 unsigned Shift = CShift->getZExtValue();
11533 unsigned NB = CRHS->getAPIntValue().countr_zero();
11534 unsigned Offset = NB + Shift;
11535 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
11536 SDLoc SL(N);
11537 SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
11538 LHS->getOperand(0),
11539 DAG.getConstant(Offset, SL, MVT::i32),
11540 DAG.getConstant(Bits, SL, MVT::i32));
11541 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
11542 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
11543 DAG.getValueType(NarrowVT));
11544 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
11545 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
11546 return Shl;
11547 }
11548 }
11549 }
11550
11551 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
11552 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
11553 isa<ConstantSDNode>(LHS.getOperand(2))) {
11554 uint32_t Sel = getConstantPermuteMask(Mask);
11555 if (!Sel)
11556 return SDValue();
11557
11558 // Select 0xc for all zero bytes
11559 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
11560 SDLoc DL(N);
11561 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
11562 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
11563 }
11564 }
11565
11566 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
11567 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
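 // Rationale: (fcmp ord x, x) holds exactly when x is not a NaN, and
 // (fcmp une (fabs x), inf) additionally excludes +/- infinity, so the
 // conjunction holds exactly for finite x, i.e. an fp_class test of every
 // class except the NaN and infinity classes.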
11568 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
11569 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
11570 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
11571
11572 SDValue X = LHS.getOperand(0);
11573 SDValue Y = RHS.getOperand(0);
11574 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
11575 !isTypeLegal(X.getValueType()))
11576 return SDValue();
11577
11578 if (LCC == ISD::SETO) {
11579 if (X != LHS.getOperand(1))
11580 return SDValue();
11581
11582 if (RCC == ISD::SETUNE) {
11583 const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
11584 if (!C1 || !C1->isInfinity() || C1->isNegative())
11585 return SDValue();
11586
11587 const uint32_t Mask = SIInstrFlags::N_NORMAL |
11588 SIInstrFlags::N_SUBNORMAL |
11589 SIInstrFlags::N_ZERO |
11590 SIInstrFlags::P_ZERO |
11591 SIInstrFlags::P_SUBNORMAL |
11592 SIInstrFlags::P_NORMAL;
11593
11594 static_assert(((~(SIInstrFlags::S_NAN |
11595 SIInstrFlags::Q_NAN |
11596 SIInstrFlags::N_INFINITY |
11597 SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask,
11598 "mask not equal");
11599
11600 SDLoc DL(N);
11601 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
11602 X, DAG.getConstant(Mask, DL, MVT::i32));
11603 }
11604 }
11605 }
11606
11607 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
11608 std::swap(LHS, RHS);
11609
11610 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
11611 RHS.hasOneUse()) {
11612 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
11613 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan | n_nan)
11614 // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan | n_nan)
11615 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
11616 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
11617 (RHS.getOperand(0) == LHS.getOperand(0) &&
11618 LHS.getOperand(0) == LHS.getOperand(1))) {
11619 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
11620 unsigned NewMask = LCC == ISD::SETO ?
11621 Mask->getZExtValue() & ~OrdMask :
11622 Mask->getZExtValue() & OrdMask;
11623
11624 SDLoc DL(N);
11625 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
11626 DAG.getConstant(NewMask, DL, MVT::i32));
11627 }
11628 }
11629
11630 if (VT == MVT::i32 &&
11631 (RHS.getOpcode() == ISD::SIGN_EXTEND || LHS.getOpcode() == ISD::SIGN_EXTEND)) {
11632 // and x, (sext cc from i1) => select cc, x, 0
11633 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
11634 std::swap(LHS, RHS);
11635 if (isBoolSGPR(RHS.getOperand(0)))
11636 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0),
11637 LHS, DAG.getConstant(0, SDLoc(N), MVT::i32));
11638 }
11639
11640 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
11641 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11642 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
11643 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
11644 uint32_t LHSMask = getPermuteMask(LHS);
11645 uint32_t RHSMask = getPermuteMask(RHS);
11646 if (LHSMask != ~0u && RHSMask != ~0u) {
11647 // Canonicalize the expression in an attempt to have fewer unique masks
11648 // and therefore fewer registers used to hold the masks.
11649 if (LHSMask > RHSMask) {
11650 std::swap(LHSMask, RHSMask);
11651 std::swap(LHS, RHS);
11652 }
11653
11654 // Select 0xc for each lane used from source operand. Zero has 0xc mask
11655 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
11656 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11657 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11658
11659 // Check if we need to combine values from two sources within a byte.
11660 if (!(LHSUsedLanes & RHSUsedLanes) &&
11661 // If we select high and lower word keep it for SDWA.
11662 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
11663 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
11664 // Each byte in each mask is either a selector value 0-3, or has higher
11665 // bits set in either of the masks: 0xff for a 0xff byte or 0x0c for a
11666 // zero byte. If 0x0c appears in either mask it must stay 0x0c. Otherwise
11667 // the mask which is not 0xff wins. By ANDing both masks we get a correct
11668 // result, except that any 0x0c byte must be corrected back to exactly 0x0c.
11669 uint32_t Mask = LHSMask & RHSMask;
11670 for (unsigned I = 0; I < 32; I += 8) {
11671 uint32_t ByteSel = 0xff << I;
11672 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
11673 Mask &= (0x0c << I) & 0xffffffff;
11674 }
11675
11676 // Add 4 to each active LHS lane. It will not affect any existing 0xff
11677 // or 0x0c.
11678 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
11679 SDLoc DL(N);
11680
11681 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
11682 LHS.getOperand(0), RHS.getOperand(0),
11683 DAG.getConstant(Sel, DL, MVT::i32));
11684 }
11685 }
11686 }
11687
11688 return SDValue();
11689}
11690
11691// A key component of v_perm is a mapping between byte position of the src
11692// operands, and the byte position of the dest. To provide such, we need: 1. the
11693// node that provides x byte of the dest of the OR, and 2. the byte of the node
11694// used to provide that x byte. calculateByteProvider finds which node provides
11695// a certain byte of the dest of the OR, and calculateSrcByte takes that node,
11696// and finds an ultimate src and byte position For example: The supported
11697// LoadCombine pattern for vector loads is as follows
11698// t1
11699// or
11700// / \
11701// t2 t3
11702// zext shl
11703// | | \
11704// t4 t5 16
11705// or anyext
11706// / \ |
11707// t6 t7 t8
11708// srl shl or
11709// / | / \ / \
11710// t9 t10 t11 t12 t13 t14
11711// trunc* 8 trunc* 8 and and
11712// | | / | | \
11713// t15 t16 t17 t18 t19 t20
11714// trunc* 255 srl -256
11715// | / \
11716// t15 t15 16
11717//
11718// *In this example, the truncs are from i32->i16
11719//
11720// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
11721// respectively. calculateSrcByte would find (given node) -> ultimate src &
11722// byteposition: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
11723// After finding the mapping, we can combine the tree into vperm t15, t16,
11724// 0x05000407
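// In the PERM nodes built by these combines, the selector bytes follow the
// v_perm_b32 convention: values 4-7 address the first operand (the combines
// add 4 to the LHS lanes), values 0-3 address the second operand, 0x0c yields
// a zero byte and 0xff yields 0xff.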
11725
11726// Find the source and byte position from a node.
11727// \p DestByte is the byte position of the dest of the or that the src
11728// ultimately provides. \p SrcIndex is the byte of the src that maps to this
11729// dest of the or byte. \p Depth tracks how many recursive iterations we have
11730// performed.
11731static const std::optional<ByteProvider<SDValue>>
11732calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
11733 unsigned Depth = 0) {
11734 // We may need to recursively traverse a series of SRLs
11735 if (Depth >= 6)
11736 return std::nullopt;
11737
11738 if (Op.getValueSizeInBits() < 8)
11739 return std::nullopt;
11740
11741 if (Op.getValueType().isVector())
11742 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
11743
11744 switch (Op->getOpcode()) {
11745 case ISD::TRUNCATE: {
11746 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
11747 }
11748
11749 case ISD::SIGN_EXTEND:
11750 case ISD::ZERO_EXTEND:
11751 case ISD::SIGN_EXTEND_INREG: {
11752 SDValue NarrowOp = Op->getOperand(0);
11753 auto NarrowVT = NarrowOp.getValueType();
11754 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
11755 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
11756 NarrowVT = VTSign->getVT();
11757 }
11758 if (!NarrowVT.isByteSized())
11759 return std::nullopt;
11760 uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
11761
11762 if (SrcIndex >= NarrowByteWidth)
11763 return std::nullopt;
11764 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
11765 }
11766
11767 case ISD::SRA:
11768 case ISD::SRL: {
11769 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11770 if (!ShiftOp)
11771 return std::nullopt;
11772
11773 uint64_t BitShift = ShiftOp->getZExtValue();
11774
11775 if (BitShift % 8 != 0)
11776 return std::nullopt;
11777
11778 SrcIndex += BitShift / 8;
11779
11780 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
11781 }
11782
11783 default: {
11784 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
11785 }
11786 }
11787 llvm_unreachable("fully handled switch");
11788}
11789
11790// For a byte position in the result of an Or, traverse the tree and find the
11791// node (and the byte of the node) which ultimately provides this {Or,
11792// BytePosition}. \p Op is the operand we are currently examining. \p Index is
11793// the byte position of the Op that corresponds with the originally requested
11794// byte of the Or \p Depth tracks how many recursive iterations we have
11795// performed. \p StartingIndex is the originally requested byte of the Or
11796static const std::optional<ByteProvider<SDValue>>
11797calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
11798 unsigned StartingIndex = 0) {
11799 // Finding Src tree of RHS of or typically requires at least 1 additional
11800 // depth
11801 if (Depth > 6)
11802 return std::nullopt;
11803
11804 unsigned BitWidth = Op.getScalarValueSizeInBits();
11805 if (BitWidth % 8 != 0)
11806 return std::nullopt;
11807 if (Index > BitWidth / 8 - 1)
11808 return std::nullopt;
11809
11810 bool IsVec = Op.getValueType().isVector();
11811 switch (Op.getOpcode()) {
11812 case ISD::OR: {
11813 if (IsVec)
11814 return std::nullopt;
11815
11816 auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
11817 StartingIndex);
11818 if (!RHS)
11819 return std::nullopt;
11820 auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
11821 StartingIndex);
11822 if (!LHS)
11823 return std::nullopt;
11824 // A well formed Or will have two ByteProviders for each byte, one of which
11825 // is constant zero
11826 if (!LHS->isConstantZero() && !RHS->isConstantZero())
11827 return std::nullopt;
11828 if (!LHS || LHS->isConstantZero())
11829 return RHS;
11830 if (!RHS || RHS->isConstantZero())
11831 return LHS;
11832 return std::nullopt;
11833 }
11834
11835 case ISD::AND: {
11836 if (IsVec)
11837 return std::nullopt;
11838
11839 auto BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11840 if (!BitMaskOp)
11841 return std::nullopt;
11842
11843 uint32_t BitMask = BitMaskOp->getZExtValue();
11844 // Bits we expect for our StartingIndex
11845 uint32_t IndexMask = 0xFF << (Index * 8);
11846
11847 if ((IndexMask & BitMask) != IndexMask) {
11848 // If the result of the and partially provides the byte, then it
11849 // is not well formatted
11850 if (IndexMask & BitMask)
11851 return std::nullopt;
11852 return ByteProvider<SDValue>::getConstantZero();
11853 }
11854
11855 return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
11856 }
11857
11858 case ISD::FSHR: {
11859 if (IsVec)
11860 return std::nullopt;
11861
11862 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
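 // For a 32-bit fshr with Z = 8 the result is (X << 24) | (Y >> 8): result
 // byte 3 comes from X byte 0 and result bytes 0-2 come from Y bytes 1-3,
 // which is what the ByteShift/NewIndex arithmetic below computes.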
11863 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
11864 if (!ShiftOp || Op.getValueType().isVector())
11865 return std::nullopt;
11866
11867 uint64_t BitsProvided = Op.getValueSizeInBits();
11868 if (BitsProvided % 8 != 0)
11869 return std::nullopt;
11870
11871 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
11872 if (BitShift % 8)
11873 return std::nullopt;
11874
11875 uint64_t ConcatSizeInBytes = BitsProvided / 4;
11876 uint64_t ByteShift = BitShift / 8;
11877
11878 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
11879 uint64_t BytesProvided = BitsProvided / 8;
11880 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
11881 NewIndex %= BytesProvided;
11882 return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
11883 }
11884
11885 case ISD::SRA:
11886 case ISD::SRL: {
11887 if (IsVec)
11888 return std::nullopt;
11889
11890 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11891 if (!ShiftOp)
11892 return std::nullopt;
11893
11894 uint64_t BitShift = ShiftOp->getZExtValue();
11895 if (BitShift % 8)
11896 return std::nullopt;
11897
11898 auto BitsProvided = Op.getScalarValueSizeInBits();
11899 if (BitsProvided % 8 != 0)
11900 return std::nullopt;
11901
11902 uint64_t BytesProvided = BitsProvided / 8;
11903 uint64_t ByteShift = BitShift / 8;
11904 // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes.
11905 // If the byte we are trying to provide (as tracked by index) falls in this
11906 // range, then the SRL provides the byte. The byte of interest of the src of
11907 // the SRL is Index + ByteShift
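 // (e.g. for a 32-bit value shifted right by 16, ByteShift = 2: result bytes
 // 0-1 come from source bytes 2-3 and result bytes 2-3 are known zero).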
11908 return BytesProvided - ByteShift > Index
11909 ? calculateSrcByte(Op->getOperand(0), StartingIndex,
11910 Index + ByteShift)
11911 : ByteProvider<SDValue>::getConstantZero();
11912 }
11913
11914 case ISD::SHL: {
11915 if (IsVec)
11916 return std::nullopt;
11917
11918 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11919 if (!ShiftOp)
11920 return std::nullopt;
11921
11922 uint64_t BitShift = ShiftOp->getZExtValue();
11923 if (BitShift % 8 != 0)
11924 return std::nullopt;
11925 uint64_t ByteShift = BitShift / 8;
11926
11927 // If we are shifting by an amount greater than (or equal to)
11928 // the index we are trying to provide, then it provides 0s. If not,
11929 // then these bytes are not definitively 0s, and the corresponding byte
11930 // of interest is Index - ByteShift of the src
11931 return Index < ByteShift
11932 ? ByteProvider<SDValue>::getConstantZero()
11933 : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
11934 Depth + 1, StartingIndex);
11935 }
11936 case ISD::ANY_EXTEND:
11937 case ISD::SIGN_EXTEND:
11938 case ISD::ZERO_EXTEND:
11939 case ISD::SIGN_EXTEND_INREG:
11940 case ISD::AssertZext:
11941 case ISD::AssertSext: {
11942 if (IsVec)
11943 return std::nullopt;
11944
11945 SDValue NarrowOp = Op->getOperand(0);
11946 unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
11947 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
11948 Op->getOpcode() == ISD::AssertZext ||
11949 Op->getOpcode() == ISD::AssertSext) {
11950 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
11951 NarrowBitWidth = VTSign->getVT().getSizeInBits();
11952 }
11953 if (NarrowBitWidth % 8 != 0)
11954 return std::nullopt;
11955 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
11956
11957 if (Index >= NarrowByteWidth)
11958 return Op.getOpcode() == ISD::ZERO_EXTEND
11959 ? std::optional<ByteProvider<SDValue>>(
11960 ByteProvider<SDValue>::getConstantZero())
11961 : std::nullopt;
11962 return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
11963 }
11964
11965 case ISD::TRUNCATE: {
11966 if (IsVec)
11967 return std::nullopt;
11968
11969 uint64_t NarrowByteWidth = BitWidth / 8;
11970
11971 if (NarrowByteWidth >= Index) {
11972 return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
11973 StartingIndex);
11974 }
11975
11976 return std::nullopt;
11977 }
11978
11979 case ISD::CopyFromReg: {
11980 if (BitWidth / 8 > Index)
11981 return calculateSrcByte(Op, StartingIndex, Index);
11982
11983 return std::nullopt;
11984 }
11985
11986 case ISD::LOAD: {
11987 auto L = cast<LoadSDNode>(Op.getNode());
11988
11989 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
11990 if (NarrowBitWidth % 8 != 0)
11991 return std::nullopt;
11992 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
11993
11994 // If the width of the load does not reach the byte we are trying to provide
11995 // and it is not a ZEXTLOAD, then the load does not provide for the byte in
11996 // question.
11997 if (Index >= NarrowByteWidth) {
11998 return L->getExtensionType() == ISD::ZEXTLOAD
11999 ? std::optional<ByteProvider<SDValue>>(
12000 ByteProvider<SDValue>::getConstantZero())
12001 : std::nullopt;
12002 }
12003
12004 if (NarrowByteWidth > Index) {
12005 return calculateSrcByte(Op, StartingIndex, Index);
12006 }
12007
12008 return std::nullopt;
12009 }
12010
12011 case ISD::BSWAP: {
12012 if (IsVec)
12013 return std::nullopt;
12014
12015 return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
12016 Depth + 1, StartingIndex);
12017 }
12018
12019 case ISD::EXTRACT_VECTOR_ELT: {
12020 auto IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12021 if (!IdxOp)
12022 return std::nullopt;
12023 auto VecIdx = IdxOp->getZExtValue();
12024 auto ScalarSize = Op.getScalarValueSizeInBits();
12025 if (ScalarSize < 32)
12026 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
12027 return calculateSrcByte(ScalarSize >= 32 ? Op : Op.getOperand(0),
12028 StartingIndex, Index);
12029 }
12030
12031 case AMDGPUISD::PERM: {
12032 if (IsVec)
12033 return std::nullopt;
12034
12035 auto PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
12036 if (!PermMask)
12037 return std::nullopt;
12038
12039 auto IdxMask =
12040 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
12041 if (IdxMask > 0x07 && IdxMask != 0x0c)
12042 return std::nullopt;
12043
12044 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
12045 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
12046
12047 return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
12048 : ByteProvider<SDValue>(
12049 ByteProvider<SDValue>::getConstantZero());
12050 }
12051
12052 default: {
12053 return std::nullopt;
12054 }
12055 }
12056
12057 llvm_unreachable("fully handled switch");
12058}
12059
12060// Returns true if the Operand is a scalar and is 16 bits
12061static bool isExtendedFrom16Bits(SDValue &Operand) {
12062
12063 switch (Operand.getOpcode()) {
12064 case ISD::ANY_EXTEND:
12065 case ISD::SIGN_EXTEND:
12066 case ISD::ZERO_EXTEND: {
12067 auto OpVT = Operand.getOperand(0).getValueType();
12068 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
12069 }
12070 case ISD::LOAD: {
12071 LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
12072 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
12073 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
12074 ExtType == ISD::EXTLOAD) {
12075 auto MemVT = L->getMemoryVT();
12076 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
12077 }
12078 return L->getMemoryVT().getSizeInBits() == 16;
12079 }
12080 default:
12081 return false;
12082 }
12083}
12084
12085// Returns true if the mask matches consecutive bytes, and the first byte
12086// begins at a power of 2 byte offset from 0th byte
12087static bool addresses16Bits(int Mask) {
12088 int Low8 = Mask & 0xff;
12089 int Hi8 = (Mask & 0xff00) >> 8;
12090
12091 assert(Low8 < 8 && Hi8 < 8);
12092 // Are the bytes contiguous in the order of increasing addresses.
12093 bool IsConsecutive = (Hi8 - Low8 == 1);
12094 // Is the first byte at location that is aligned for 16 bit instructions.
12095 // A counter example is taking 2 consecutive bytes starting at the 8th bit.
12096 // In this case, we still need code to extract the 16 bit operand, so it
12097 // is better to use i8 v_perm
12098 bool Is16Aligned = !(Low8 % 2);
12099
12100 return IsConsecutive && Is16Aligned;
12101}
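// For example, masks 0x0100, 0x0302 and 0x0504 each address a whole aligned
// 16-bit half and return true, while 0x0403 (consecutive but starting at
// byte 3) and 0x0402 (not consecutive) return false.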
12102
12103// Do not lower into v_perm if the operands are actually 16 bit
12104// and the selected bits (based on PermMask) correspond with two
12105// easily addressable 16 bit operands.
12106 static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
12107 SDValue &OtherOp) {
12108 int Low16 = PermMask & 0xffff;
12109 int Hi16 = (PermMask & 0xffff0000) >> 16;
12110
12111 auto TempOp = peekThroughBitcasts(Op);
12112 auto TempOtherOp = peekThroughBitcasts(OtherOp);
12113
12114 auto OpIs16Bit =
12115 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
12116 if (!OpIs16Bit)
12117 return true;
12118
12119 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
12120 isExtendedFrom16Bits(TempOtherOp);
12121 if (!OtherOpIs16Bit)
12122 return true;
12123
12124 // Do we cleanly address both
12125 return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
12126}
12127
12128 static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
12129 unsigned DWordOffset) {
12130 SDValue Ret;
12131
12132 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
12133 // ByteProvider must be at least 8 bits
12134 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
12135
12136 if (TypeSize <= 32)
12137 return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
12138
12139 if (Src.getValueType().isVector()) {
12140 auto ScalarTySize = Src.getScalarValueSizeInBits();
12141 auto ScalarTy = Src.getValueType().getScalarType();
12142 if (ScalarTySize == 32) {
12143 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src,
12144 DAG.getConstant(DWordOffset, SL, MVT::i32));
12145 }
12146 if (ScalarTySize > 32) {
12147 Ret = DAG.getNode(
12148 ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
12149 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
12150 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
12151 if (ShiftVal)
12152 Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
12153 DAG.getConstant(ShiftVal, SL, MVT::i32));
12154 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12155 }
12156
12157 assert(ScalarTySize < 32);
12158 auto NumElements = TypeSize / ScalarTySize;
12159 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
12160 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
12161 auto NumElementsIn32 = 32 / ScalarTySize;
12162 auto NumAvailElements = DWordOffset < Trunc32Elements
12163 ? NumElementsIn32
12164 : NumElements - NormalizedTrunc;
12165
12166 SmallVector<SDValue, 4> VecSrcs;
12167 DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32,
12168 NumAvailElements);
12169
12170 Ret = DAG.getBuildVector(
12171 MVT::getVectorVT(MVT::getIntegerVT(ScalarTySize), NumAvailElements), SL,
12172 VecSrcs);
12173 return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12174 }
12175
12176 /// Scalar Type
12177 auto ShiftVal = 32 * DWordOffset;
12178 Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src,
12179 DAG.getConstant(ShiftVal, SL, MVT::i32));
12180 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12181}
12182
12183 static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
12184 SelectionDAG &DAG = DCI.DAG;
12185 [[maybe_unused]] EVT VT = N->getValueType(0);
12186 SmallVector<ByteProvider<SDValue>, 8> PermNodes;
12187
12188 // VT is known to be MVT::i32, so we need to provide 4 bytes.
12189 assert(VT == MVT::i32);
12190 for (int i = 0; i < 4; i++) {
12191 // Find the ByteProvider that provides the ith byte of the result of OR
12192 std::optional<ByteProvider<SDValue>> P =
12193 calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
12194 // TODO support constantZero
12195 if (!P || P->isConstantZero())
12196 return SDValue();
12197
12198 PermNodes.push_back(*P);
12199 }
12200 if (PermNodes.size() != 4)
12201 return SDValue();
12202
12203 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
12204 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
12205 uint64_t PermMask = 0x00000000;
12206 for (size_t i = 0; i < PermNodes.size(); i++) {
12207 auto PermOp = PermNodes[i];
12208 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
12209 // by sizeof(Src2) = 4
12210 int SrcByteAdjust = 4;
12211
12212 // If the Src uses a byte from a different DWORD, then it corresponds
12213 // with a different source.
12214 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
12215 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
12216 if (SecondSrc)
12217 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
12218 ((PermOp.SrcOffset / 4) != SecondSrc->second))
12219 return SDValue();
12220
12221 // Set the index of the second distinct Src node
12222 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
12223 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
12224 SrcByteAdjust = 0;
12225 }
12226 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
12228 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
12229 }
12230 SDLoc DL(N);
12231 SDValue Op = *PermNodes[FirstSrc.first].Src;
12232 Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
12233 assert(Op.getValueSizeInBits() == 32);
12234
12235 // Check that we are not just extracting the bytes in order from an op
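 // E.g. with a single source dword providing its bytes in order, the mask
 // built above is 0x07060504, which the check below recognizes and folds to a
 // plain bitcast of Op rather than emitting a redundant v_perm.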
12236 if (!SecondSrc) {
12237 int Low16 = PermMask & 0xffff;
12238 int Hi16 = (PermMask & 0xffff0000) >> 16;
12239
12240 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
12241 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
12242
12243 // The perm op would really just produce Op. So combine into Op
12244 if (WellFormedLow && WellFormedHi)
12245 return DAG.getBitcast(MVT::getIntegerVT(32), Op);
12246 }
12247
12248 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
12249
12250 if (SecondSrc) {
12251 OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
12252 assert(OtherOp.getValueSizeInBits() == 32);
12253 }
12254
12255 if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
12256
12257 assert(Op.getValueType().isByteSized() &&
12258 OtherOp.getValueType().isByteSized());
12259
12260 // If the ultimate src is less than 32 bits, then we will only be
12261 // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
12262 // CalculateByteProvider would not have returned Op as source if we
12263 // used a byte that is outside its ValueType. Thus, we are free to
12264 // ANY_EXTEND as the extended bits are dont-cares.
12265 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
12266 OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
12267
12268 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
12269 DAG.getConstant(PermMask, DL, MVT::i32));
12270 }
12271 return SDValue();
12272}
12273
12274SDValue SITargetLowering::performOrCombine(SDNode *N,
12275 DAGCombinerInfo &DCI) const {
12276 SelectionDAG &DAG = DCI.DAG;
12277 SDValue LHS = N->getOperand(0);
12278 SDValue RHS = N->getOperand(1);
12279
12280 EVT VT = N->getValueType(0);
12281 if (VT == MVT::i1) {
12282 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
12283 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
12284 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
12285 SDValue Src = LHS.getOperand(0);
12286 if (Src != RHS.getOperand(0))
12287 return SDValue();
12288
12289 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
12290 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
12291 if (!CLHS || !CRHS)
12292 return SDValue();
12293
12294 // Only 10 bits are used.
12295 static const uint32_t MaxMask = 0x3ff;
12296
12297 uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
12298 SDLoc DL(N);
12299 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
12300 Src, DAG.getConstant(NewMask, DL, MVT::i32));
12301 }
12302
12303 return SDValue();
12304 }
12305
12306 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
12307 if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
12308 LHS.getOpcode() == AMDGPUISD::PERM &&
12309 isa<ConstantSDNode>(LHS.getOperand(2))) {
12310 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
12311 if (!Sel)
12312 return SDValue();
12313
12314 Sel |= LHS.getConstantOperandVal(2);
12315 SDLoc DL(N);
12316 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
12317 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
12318 }
12319
12320 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
12321 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
12322 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
12323 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
12324
12325 // If all the uses of an or need to extract the individual elements, do not
12326 // attempt to lower into v_perm
12327 auto usesCombinedOperand = [](SDNode *OrUse) {
12328 // If we have any non-vectorized use, then it is a candidate for v_perm
12329 if (OrUse->getOpcode() != ISD::BITCAST ||
12330 !OrUse->getValueType(0).isVector())
12331 return true;
12332
12333 // If we have any non-vectorized use, then it is a candidate for v_perm
12334 for (auto VUse : OrUse->uses()) {
12335 if (!VUse->getValueType(0).isVector())
12336 return true;
12337
12338 // If the use of a vector is a store, then combining via a v_perm
12339 // is beneficial.
12340 // TODO -- whitelist more uses
12341 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
12342 if (VUse->getOpcode() == VectorwiseOp)
12343 return true;
12344 }
12345 return false;
12346 };
12347
12348 if (!any_of(N->uses(), usesCombinedOperand))
12349 return SDValue();
12350
12351 uint32_t LHSMask = getPermuteMask(LHS);
12352 uint32_t RHSMask = getPermuteMask(RHS);
12353
12354 if (LHSMask != ~0u && RHSMask != ~0u) {
12355 // Canonicalize the expression in an attempt to have fewer unique masks
12356 // and therefore fewer registers used to hold the masks.
12357 if (LHSMask > RHSMask) {
12358 std::swap(LHSMask, RHSMask);
12359 std::swap(LHS, RHS);
12360 }
12361
12362 // Select 0xc for each lane used from source operand. Zero has 0xc mask
12363 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
12364 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12365 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12366
12367 // Check if we need to combine values from two sources within a byte.
12368 if (!(LHSUsedLanes & RHSUsedLanes) &&
12369 // If we select high and lower word keep it for SDWA.
12370 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
12371 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
12372 // Kill zero bytes selected by other mask. Zero value is 0xc.
12373 LHSMask &= ~RHSUsedLanes;
12374 RHSMask &= ~LHSUsedLanes;
12375 // Add 4 to each active LHS lane
12376 LHSMask |= LHSUsedLanes & 0x04040404;
12377 // Combine masks
12378 uint32_t Sel = LHSMask | RHSMask;
12379 SDLoc DL(N);
12380
12381 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
12382 LHS.getOperand(0), RHS.getOperand(0),
12383 DAG.getConstant(Sel, DL, MVT::i32));
12384 }
12385 }
12386 if (LHSMask == ~0u || RHSMask == ~0u) {
12387 if (SDValue Perm = matchPERM(N, DCI))
12388 return Perm;
12389 }
12390 }
12391
12392 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
12393 return SDValue();
12394
12395 // TODO: This could be a generic combine with a predicate for extracting the
12396 // high half of an integer being free.
12397
12398 // (or i64:x, (zero_extend i32:y)) ->
12399 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
12400 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
12401 RHS.getOpcode() != ISD::ZERO_EXTEND)
12402 std::swap(LHS, RHS);
12403
12404 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
12405 SDValue ExtSrc = RHS.getOperand(0);
12406 EVT SrcVT = ExtSrc.getValueType();
12407 if (SrcVT == MVT::i32) {
12408 SDLoc SL(N);
12409 SDValue LowLHS, HiBits;
12410 std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
12411 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
12412
12413 DCI.AddToWorklist(LowOr.getNode());
12414 DCI.AddToWorklist(HiBits.getNode());
12415
12416 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
12417 LowOr, HiBits);
12418 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
12419 }
12420 }
12421
12422 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
12423 if (CRHS) {
12424 if (SDValue Split
12425 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
12426 N->getOperand(0), CRHS))
12427 return Split;
12428 }
12429
12430 return SDValue();
12431}
12432
12433SDValue SITargetLowering::performXorCombine(SDNode *N,
12434 DAGCombinerInfo &DCI) const {
12435 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
12436 return RV;
12437
12438 SDValue LHS = N->getOperand(0);
12439 SDValue RHS = N->getOperand(1);
12440
12441 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
12442 SelectionDAG &DAG = DCI.DAG;
12443
12444 EVT VT = N->getValueType(0);
12445 if (CRHS && VT == MVT::i64) {
12446 if (SDValue Split
12447 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
12448 return Split;
12449 }
12450
12451 // Make sure to apply the 64-bit constant splitting fold before trying to fold
12452 // fneg-like xors into 64-bit select.
12453 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
12454 // This looks like an fneg, try to fold as a source modifier.
12455 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
12456 shouldFoldFNegIntoSrc(N, LHS)) {
12457 // xor (select c, a, b), 0x80000000 ->
12458 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
12459 SDLoc DL(N);
12460 SDValue CastLHS =
12461 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
12462 SDValue CastRHS =
12463 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
12464 SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS);
12465 SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS);
12466 SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32,
12467 LHS->getOperand(0), FNegLHS, FNegRHS);
12468 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
12469 }
12470 }
12471
12472 return SDValue();
12473}
12474
12475SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
12476 DAGCombinerInfo &DCI) const {
12477 if (!Subtarget->has16BitInsts() ||
12478 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
12479 return SDValue();
12480
12481 EVT VT = N->getValueType(0);
12482 if (VT != MVT::i32)
12483 return SDValue();
12484
12485 SDValue Src = N->getOperand(0);
12486 if (Src.getValueType() != MVT::i16)
12487 return SDValue();
12488
12489 return SDValue();
12490}
12491
12492SDValue
12493SITargetLowering::performSignExtendInRegCombine(SDNode *N,
12494 DAGCombinerInfo &DCI) const {
12495 SDValue Src = N->getOperand(0);
12496 auto *VTSign = cast<VTSDNode>(N->getOperand(1));
12497
12498 // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
12499 // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
12500 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
12501 VTSign->getVT() == MVT::i8) ||
12502 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
12503 VTSign->getVT() == MVT::i16))) {
12504 assert(Subtarget->hasScalarSubwordLoads() &&
12505 "s_buffer_load_{u8, i8} are supported "
12506 "in GFX12 (or newer) architectures.");
12507 EVT VT = Src.getValueType();
12508 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
12509 ? AMDGPUISD::SBUFFER_LOAD_BYTE
12510 : AMDGPUISD::SBUFFER_LOAD_SHORT;
12511 SDLoc DL(N);
12512 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
12513 SDValue Ops[] = {
12514 Src.getOperand(0), // source register
12515 Src.getOperand(1), // offset
12516 Src.getOperand(2) // cachePolicy
12517 };
12518 auto *M = cast<MemSDNode>(Src);
12519 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
12520 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
12521 SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
12522 return LoadVal;
12523 }
12524 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
12525 VTSign->getVT() == MVT::i8) ||
12526 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
12527 VTSign->getVT() == MVT::i16)) &&
12528 Src.hasOneUse()) {
12529 auto *M = cast<MemSDNode>(Src);
12530 SDValue Ops[] = {
12531 Src.getOperand(0), // Chain
12532 Src.getOperand(1), // rsrc
12533 Src.getOperand(2), // vindex
12534 Src.getOperand(3), // voffset
12535 Src.getOperand(4), // soffset
12536 Src.getOperand(5), // offset
12537 Src.getOperand(6),
12538 Src.getOperand(7)
12539 };
12540 // replace with BUFFER_LOAD_BYTE/SHORT
12541 SDVTList ResList = DCI.DAG.getVTList(MVT::i32,
12542 Src.getOperand(0).getValueType());
12543 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE) ?
12544 AMDGPUISD::BUFFER_LOAD_BYTE : AMDGPUISD::BUFFER_LOAD_SHORT;
12545 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(Opc, SDLoc(N),
12546 ResList,
12547 Ops, M->getMemoryVT(),
12548 M->getMemOperand());
12549 return DCI.DAG.getMergeValues({BufferLoadSignExt,
12550 BufferLoadSignExt.getValue(1)}, SDLoc(N));
12551 }
12552 return SDValue();
12553}
12554
12555SDValue SITargetLowering::performClassCombine(SDNode *N,
12556 DAGCombinerInfo &DCI) const {
12557 SelectionDAG &DAG = DCI.DAG;
12558 SDValue Mask = N->getOperand(1);
12559
12560 // fp_class x, 0 -> false
12561 if (isNullConstant(Mask))
12562 return DAG.getConstant(0, SDLoc(N), MVT::i1);
12563
12564 if (N->getOperand(0).isUndef())
12565 return DAG.getUNDEF(MVT::i1);
12566
12567 return SDValue();
12568}
12569
12570SDValue SITargetLowering::performRcpCombine(SDNode *N,
12571 DAGCombinerInfo &DCI) const {
12572 EVT VT = N->getValueType(0);
12573 SDValue N0 = N->getOperand(0);
12574
12575 if (N0.isUndef()) {
12576 return DCI.DAG.getConstantFP(
12577 APFloat::getQNaN(SelectionDAG::EVTToAPFloatSemantics(VT)), SDLoc(N),
12578 VT);
12579 }
12580
12581 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
12582 N0.getOpcode() == ISD::SINT_TO_FP)) {
12583 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
12584 N->getFlags());
12585 }
12586
12587 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
12588 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
12589 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
12590 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT,
12591 N0.getOperand(0), N->getFlags());
12592 }
12593
12594 return AMDGPUTargetLowering::performRcpCombine(N, DCI);
12595 }
12596
12597 bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
12598 unsigned MaxDepth) const {
12599 unsigned Opcode = Op.getOpcode();
12600 if (Opcode == ISD::FCANONICALIZE)
12601 return true;
12602
12603 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
12604 const auto &F = CFP->getValueAPF();
12605 if (F.isNaN() && F.isSignaling())
12606 return false;
12607 if (!F.isDenormal())
12608 return true;
12609
12610 DenormalMode Mode =
12611 DAG.getMachineFunction().getDenormalMode(F.getSemantics());
12612 return Mode == DenormalMode::getIEEE();
12613 }
12614
12615 // If source is a result of another standard FP operation it is already in
12616 // canonical form.
12617 if (MaxDepth == 0)
12618 return false;
12619
12620 switch (Opcode) {
12621 // These will flush denorms if required.
12622 case ISD::FADD:
12623 case ISD::FSUB:
12624 case ISD::FMUL:
12625 case ISD::FCEIL:
12626 case ISD::FFLOOR:
12627 case ISD::FMA:
12628 case ISD::FMAD:
12629 case ISD::FSQRT:
12630 case ISD::FDIV:
12631 case ISD::FREM:
12632 case ISD::FP_ROUND:
12633 case ISD::FP_EXTEND:
12634 case ISD::FP16_TO_FP:
12635 case ISD::FP_TO_FP16:
12636 case ISD::BF16_TO_FP:
12637 case ISD::FP_TO_BF16:
12638 case ISD::FLDEXP:
12641 case AMDGPUISD::RCP:
12642 case AMDGPUISD::RSQ:
12646 case AMDGPUISD::LOG:
12647 case AMDGPUISD::EXP:
12651 case AMDGPUISD::FRACT:
12658 case AMDGPUISD::SIN_HW:
12659 case AMDGPUISD::COS_HW:
12660 return true;
12661
12662 // It can/will be lowered or combined as a bit operation.
12663 // Need to check their input recursively to handle.
12664 case ISD::FNEG:
12665 case ISD::FABS:
12666 case ISD::FCOPYSIGN:
12667 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12668
12669 case ISD::AND:
12670 if (Op.getValueType() == MVT::i32) {
12671 // Be careful as we only know it is a bitcast floating point type. It
12672 // could be f32, v2f16, we have no way of knowing. Luckily the constant
12673 // value that we optimize for, which comes up in fp32 to bf16 conversions,
12674 // is valid to optimize for all types.
12675 if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
12676 if (RHS->getZExtValue() == 0xffff0000) {
12677 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12678 }
12679 }
12680 }
12681 break;
12682
12683 case ISD::FSIN:
12684 case ISD::FCOS:
12685 case ISD::FSINCOS:
12686 return Op.getValueType().getScalarType() != MVT::f16;
12687
12688 case ISD::FMINNUM:
12689 case ISD::FMAXNUM:
12690 case ISD::FMINNUM_IEEE:
12691 case ISD::FMAXNUM_IEEE:
12692 case ISD::FMINIMUM:
12693 case ISD::FMAXIMUM:
12694 case AMDGPUISD::CLAMP:
12695 case AMDGPUISD::FMED3:
12696 case AMDGPUISD::FMAX3:
12697 case AMDGPUISD::FMIN3:
12698 case AMDGPUISD::FMAXIMUM3:
12699 case AMDGPUISD::FMINIMUM3: {
12700 // FIXME: Shouldn't treat the generic operations differently based on these.
12701 // However, we aren't really required to flush the result from
12702 // minnum/maxnum..
12703
12704 // snans will be quieted, so we only need to worry about denormals.
12705 if (Subtarget->supportsMinMaxDenormModes() ||
12706 // FIXME: denormalsEnabledForType is broken for dynamic
12707 denormalsEnabledForType(DAG, Op.getValueType()))
12708 return true;
12709
12710 // Flushing may be required.
12711 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
12712 // targets need to check their input recursively.
12713
12714 // FIXME: Does this apply with clamp? It's implemented with max.
12715 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
12716 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
12717 return false;
12718 }
12719
12720 return true;
12721 }
12722 case ISD::SELECT: {
12723 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
12724 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
12725 }
12726 case ISD::BUILD_VECTOR: {
12727 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
12728 SDValue SrcOp = Op.getOperand(i);
12729 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
12730 return false;
12731 }
12732
12733 return true;
12734 }
12735 case ISD::EXTRACT_VECTOR_ELT:
12736 case ISD::EXTRACT_SUBVECTOR: {
12737 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12738 }
12739 case ISD::INSERT_VECTOR_ELT: {
12740 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
12741 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
12742 }
12743 case ISD::UNDEF:
12744 // Could be anything.
12745 return false;
12746
12747 case ISD::BITCAST:
12748 // TODO: This is incorrect as it loses track of the operand's type. We may
12749 // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
12750 // same bits that are canonicalized in one type need not be in the other.
12751 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12752 case ISD::TRUNCATE: {
12753 // Hack around the mess we make when legalizing extract_vector_elt
12754 if (Op.getValueType() == MVT::i16) {
12755 SDValue TruncSrc = Op.getOperand(0);
12756 if (TruncSrc.getValueType() == MVT::i32 &&
12757 TruncSrc.getOpcode() == ISD::BITCAST &&
12758 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
12759 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
12760 }
12761 }
12762 return false;
12763 }
12764 case ISD::INTRINSIC_WO_CHAIN: {
12765 unsigned IntrinsicID = Op.getConstantOperandVal(0);
12766 // TODO: Handle more intrinsics
12767 switch (IntrinsicID) {
12768 case Intrinsic::amdgcn_cvt_pkrtz:
12769 case Intrinsic::amdgcn_cubeid:
12770 case Intrinsic::amdgcn_frexp_mant:
12771 case Intrinsic::amdgcn_fdot2:
12772 case Intrinsic::amdgcn_rcp:
12773 case Intrinsic::amdgcn_rsq:
12774 case Intrinsic::amdgcn_rsq_clamp:
12775 case Intrinsic::amdgcn_rcp_legacy:
12776 case Intrinsic::amdgcn_rsq_legacy:
12777 case Intrinsic::amdgcn_trig_preop:
12778 case Intrinsic::amdgcn_log:
12779 case Intrinsic::amdgcn_exp2:
12780 case Intrinsic::amdgcn_sqrt:
12781 return true;
12782 default:
12783 break;
12784 }
12785
12786 break;
12787 }
12788 default:
12789 break;
12790 }
12791
12792 // FIXME: denormalsEnabledForType is broken for dynamic
12793 return denormalsEnabledForType(DAG, Op.getValueType()) &&
12794 DAG.isKnownNeverSNaN(Op);
12795}
12796
12797 bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
12798 unsigned MaxDepth) const {
12799 const MachineRegisterInfo &MRI = MF.getRegInfo();
12800 MachineInstr *MI = MRI.getVRegDef(Reg);
12801 unsigned Opcode = MI->getOpcode();
12802
12803 if (Opcode == AMDGPU::G_FCANONICALIZE)
12804 return true;
12805
12806 std::optional<FPValueAndVReg> FCR;
12807 // Constant splat (can be padded with undef) or scalar constant.
12808 if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) {
12809 if (FCR->Value.isSignaling())
12810 return false;
12811 if (!FCR->Value.isDenormal())
12812 return true;
12813
12814 DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
12815 return Mode == DenormalMode::getIEEE();
12816 }
12817
12818 if (MaxDepth == 0)
12819 return false;
12820
12821 switch (Opcode) {
12822 case AMDGPU::G_FADD:
12823 case AMDGPU::G_FSUB:
12824 case AMDGPU::G_FMUL:
12825 case AMDGPU::G_FCEIL:
12826 case AMDGPU::G_FFLOOR:
12827 case AMDGPU::G_FRINT:
12828 case AMDGPU::G_FNEARBYINT:
12829 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
12830 case AMDGPU::G_INTRINSIC_TRUNC:
12831 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
12832 case AMDGPU::G_FMA:
12833 case AMDGPU::G_FMAD:
12834 case AMDGPU::G_FSQRT:
12835 case AMDGPU::G_FDIV:
12836 case AMDGPU::G_FREM:
12837 case AMDGPU::G_FPOW:
12838 case AMDGPU::G_FPEXT:
12839 case AMDGPU::G_FLOG:
12840 case AMDGPU::G_FLOG2:
12841 case AMDGPU::G_FLOG10:
12842 case AMDGPU::G_FPTRUNC:
12843 case AMDGPU::G_AMDGPU_RCP_IFLAG:
12844 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
12845 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
12846 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
12847 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
12848 return true;
12849 case AMDGPU::G_FNEG:
12850 case AMDGPU::G_FABS:
12851 case AMDGPU::G_FCOPYSIGN:
12852 return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
12853 case AMDGPU::G_FMINNUM:
12854 case AMDGPU::G_FMAXNUM:
12855 case AMDGPU::G_FMINNUM_IEEE:
12856 case AMDGPU::G_FMAXNUM_IEEE:
12857 case AMDGPU::G_FMINIMUM:
12858 case AMDGPU::G_FMAXIMUM: {
12859 if (Subtarget->supportsMinMaxDenormModes() ||
12860 // FIXME: denormalsEnabledForType is broken for dynamic
12861 denormalsEnabledForType(MRI.getType(Reg), MF))
12862 return true;
12863
12864 [[fallthrough]];
12865 }
12866 case AMDGPU::G_BUILD_VECTOR:
12867 for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
12868 if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
12869 return false;
12870 return true;
12871 case AMDGPU::G_INTRINSIC:
12872 case AMDGPU::G_INTRINSIC_CONVERGENT:
12873 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
12874 case Intrinsic::amdgcn_fmul_legacy:
12875 case Intrinsic::amdgcn_fmad_ftz:
12876 case Intrinsic::amdgcn_sqrt:
12877 case Intrinsic::amdgcn_fmed3:
12878 case Intrinsic::amdgcn_sin:
12879 case Intrinsic::amdgcn_cos:
12880 case Intrinsic::amdgcn_log:
12881 case Intrinsic::amdgcn_exp2:
12882 case Intrinsic::amdgcn_log_clamp:
12883 case Intrinsic::amdgcn_rcp:
12884 case Intrinsic::amdgcn_rcp_legacy:
12885 case Intrinsic::amdgcn_rsq:
12886 case Intrinsic::amdgcn_rsq_clamp:
12887 case Intrinsic::amdgcn_rsq_legacy:
12888 case Intrinsic::amdgcn_div_scale:
12889 case Intrinsic::amdgcn_div_fmas:
12890 case Intrinsic::amdgcn_div_fixup:
12891 case Intrinsic::amdgcn_fract:
12892 case Intrinsic::amdgcn_cvt_pkrtz:
12893 case Intrinsic::amdgcn_cubeid:
12894 case Intrinsic::amdgcn_cubema:
12895 case Intrinsic::amdgcn_cubesc:
12896 case Intrinsic::amdgcn_cubetc:
12897 case Intrinsic::amdgcn_frexp_mant:
12898 case Intrinsic::amdgcn_fdot2:
12899 case Intrinsic::amdgcn_trig_preop:
12900 return true;
12901 default:
12902 break;
12903 }
12904
12905 [[fallthrough]];
12906 default:
12907 return false;
12908 }
12909
12910 llvm_unreachable("invalid operation");
12911}
12912
12913// Constant fold canonicalize.
12914SDValue SITargetLowering::getCanonicalConstantFP(
12915 SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const {
12916 // Flush denormals to 0 if not enabled.
12917 if (C.isDenormal()) {
12918 DenormalMode Mode =
12919 DAG.getMachineFunction().getDenormalMode(C.getSemantics());
12920 if (Mode == DenormalMode::getPreserveSign()) {
12921 return DAG.getConstantFP(
12922 APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
12923 }
12924
12925 if (Mode != DenormalMode::getIEEE())
12926 return SDValue();
12927 }
12928
12929 if (C.isNaN()) {
12930 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
12931 if (C.isSignaling()) {
12932 // Quiet a signaling NaN.
12933 // FIXME: Is this supposed to preserve payload bits?
12934 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
12935 }
12936
12937 // Make sure it is the canonical NaN bitpattern.
12938 //
12939 // TODO: Can we use -1 as the canonical NaN value since it's an inline
12940 // immediate?
12941 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
12942 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
12943 }
12944
12945 // Already canonical.
12946 return DAG.getConstantFP(C, SL, VT);
12947}
12948
12949 static bool vectorEltWillFoldAway(SDValue Op) {
12950 return Op.isUndef() || isa<ConstantFPSDNode>(Op);
12951}
12952
12953SDValue SITargetLowering::performFCanonicalizeCombine(
12954 SDNode *N,
12955 DAGCombinerInfo &DCI) const {
12956 SelectionDAG &DAG = DCI.DAG;
12957 SDValue N0 = N->getOperand(0);
12958 EVT VT = N->getValueType(0);
12959
12960 // fcanonicalize undef -> qnan
12961 if (N0.isUndef()) {
12962 APFloat QNaN = APFloat::getQNaN(SelectionDAG::EVTToAPFloatSemantics(VT));
12963 return DAG.getConstantFP(QNaN, SDLoc(N), VT);
12964 }
12965
12966 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
12967 EVT VT = N->getValueType(0);
12968 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
12969 }
12970
12971 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
12972 // (fcanonicalize k)
12973 //
12974 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
12975
12976 // TODO: This could be better with wider vectors that will be split to v2f16,
12977 // and to consider uses since there aren't that many packed operations.
12978 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
12979 isTypeLegal(MVT::v2f16)) {
12980 SDLoc SL(N);
12981 SDValue NewElts[2];
12982 SDValue Lo = N0.getOperand(0);
12983 SDValue Hi = N0.getOperand(1);
12984 EVT EltVT = Lo.getValueType();
12985
12987 for (unsigned I = 0; I != 2; ++I) {
12988 SDValue Op = N0.getOperand(I);
12989 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
12990 NewElts[I] = getCanonicalConstantFP(DAG, SL, EltVT,
12991 CFP->getValueAPF());
12992 } else if (Op.isUndef()) {
12993 // Handled below based on what the other operand is.
12994 NewElts[I] = Op;
12995 } else {
12996 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
12997 }
12998 }
12999
13000 // If one half is undef, and one is constant, prefer a splat vector rather
13001 // than the normal qNaN. If it's a register, prefer 0.0 since that's
13002 // cheaper to use and may be free with a packed operation.
13003 if (NewElts[0].isUndef()) {
13004 if (isa<ConstantFPSDNode>(NewElts[1]))
13005 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1]) ?
13006 NewElts[1]: DAG.getConstantFP(0.0f, SL, EltVT);
13007 }
13008
13009 if (NewElts[1].isUndef()) {
13010 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0]) ?
13011 NewElts[0] : DAG.getConstantFP(0.0f, SL, EltVT);
13012 }
13013
13014 return DAG.getBuildVector(VT, SL, NewElts);
13015 }
13016 }
13017
13018 return SDValue();
13019}
13020
13021static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
13022 switch (Opc) {
13023 case ISD::FMAXNUM:
13024 case ISD::FMAXNUM_IEEE:
13025 return AMDGPUISD::FMAX3;
13026 case ISD::FMAXIMUM:
13027 return AMDGPUISD::FMAXIMUM3;
13028 case ISD::SMAX:
13029 return AMDGPUISD::SMAX3;
13030 case ISD::UMAX:
13031 return AMDGPUISD::UMAX3;
13032 case ISD::FMINNUM:
13033 case ISD::FMINNUM_IEEE:
13034 return AMDGPUISD::FMIN3;
13035 case ISD::FMINIMUM:
13036 return AMDGPUISD::FMINIMUM3;
13037 case ISD::SMIN:
13038 return AMDGPUISD::SMIN3;
13039 case ISD::UMIN:
13040 return AMDGPUISD::UMIN3;
13041 default:
13042 llvm_unreachable("Not a min/max opcode");
13043 }
13044}
13045
13046SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
13047 const SDLoc &SL, SDValue Src,
13048 SDValue MinVal,
13049 SDValue MaxVal,
13050 bool Signed) const {
13051
13052 // med3 comes from
13053 // min(max(x, K0), K1), K0 < K1
13054 // max(min(x, K0), K1), K1 < K0
13055 //
13056 // "MinVal" and "MaxVal" respectively refer to the rhs of the
13057 // min/max op.
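 // For instance smin(smax(x, 3), 7) reaches here as Src = x, MaxVal = 3 and
 // MinVal = 7; since 3 < 7 the clamp is emitted as smed3(x, 3, 7).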
13058 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal);
13059 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal);
13060
13061 if (!MinK || !MaxK)
13062 return SDValue();
13063
13064 if (Signed) {
13065 if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
13066 return SDValue();
13067 } else {
13068 if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
13069 return SDValue();
13070 }
13071
13072 EVT VT = MinK->getValueType(0);
13073 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
13074 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
13075 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
13076
13077 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
13078 // not available, but this is unlikely to be profitable as constants
13079 // will often need to be materialized & extended, especially on
13080 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
13081 return SDValue();
13082}
13083
13084 static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
13085 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
13086 return C;
13087
13088 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
13089 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
13090 return C;
13091 }
13092
13093 return nullptr;
13094}
13095
13096SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
13097 const SDLoc &SL,
13098 SDValue Op0,
13099 SDValue Op1) const {
13100 ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
13101 if (!K1)
13102 return SDValue();
13103
13104 ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
13105 if (!K0)
13106 return SDValue();
13107
13108 // Ordered >= (although NaN inputs should have folded away by now).
13109 if (K0->getValueAPF() > K1->getValueAPF())
13110 return SDValue();
13111
13112 const MachineFunction &MF = DAG.getMachineFunction();
13113 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
13114
13115 // TODO: Check IEEE bit enabled?
13116 EVT VT = Op0.getValueType();
13117 if (Info->getMode().DX10Clamp) {
13118 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
13119 // hardware fmed3 behavior converting to a min.
13120 // FIXME: Should this be allowing -0.0?
13121 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
13122 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
13123 }
13124
13125 // med3 for f16 is only available on gfx9+, and not available for v2f16.
13126 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
13127 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
13128 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
13129 // then give the other result, which is different from med3 with a NaN
13130 // input.
13131 SDValue Var = Op0.getOperand(0);
13132 if (!DAG.isKnownNeverSNaN(Var))
13133 return SDValue();
13134
13135 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13136
13137 if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) &&
13138 (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) {
13139 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
13140 Var, SDValue(K0, 0), SDValue(K1, 0));
13141 }
13142 }
13143
13144 return SDValue();
13145}
13146
13147/// \return true if the subtarget supports minimum3 and maximum3 with the given
13148/// base min/max opcode \p Opc for type \p VT.
13149static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
13150 EVT VT) {
13151 switch (Opc) {
13152 case ISD::FMINNUM:
13153 case ISD::FMAXNUM:
13154 case ISD::FMINNUM_IEEE:
13155 case ISD::FMAXNUM_IEEE:
13156 case AMDGPUISD::FMIN_LEGACY:
13157 case AMDGPUISD::FMAX_LEGACY:
13158 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16());
13159 case ISD::FMINIMUM:
13160 case ISD::FMAXIMUM:
13161 return (VT == MVT::f32 || VT == MVT::f16) && Subtarget.hasIEEEMinMax3();
13162 case ISD::SMAX:
13163 case ISD::SMIN:
13164 case ISD::UMAX:
13165 case ISD::UMIN:
13166 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
13167 default:
13168 return false;
13169 }
13170
13171 llvm_unreachable("not a min/max opcode");
13172}
13173
13174SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
13175 DAGCombinerInfo &DCI) const {
13176 SelectionDAG &DAG = DCI.DAG;
13177
13178 EVT VT = N->getValueType(0);
13179 unsigned Opc = N->getOpcode();
13180 SDValue Op0 = N->getOperand(0);
13181 SDValue Op1 = N->getOperand(1);
13182
13183 // Only do this if the inner op has one use since this will just increase
13184 // register pressure for no benefit.
13185
13186 if (supportsMin3Max3(*Subtarget, Opc, VT)) {
13187 // max(max(a, b), c) -> max3(a, b, c)
13188 // min(min(a, b), c) -> min3(a, b, c)
13189 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
13190 SDLoc DL(N);
13191 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
13192 DL,
13193 N->getValueType(0),
13194 Op0.getOperand(0),
13195 Op0.getOperand(1),
13196 Op1);
13197 }
13198
13199 // Try commuted.
13200 // max(a, max(b, c)) -> max3(a, b, c)
13201 // min(a, min(b, c)) -> min3(a, b, c)
13202 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
13203 SDLoc DL(N);
13204 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
13205 DL,
13206 N->getValueType(0),
13207 Op0,
13208 Op1.getOperand(0),
13209 Op1.getOperand(1));
13210 }
13211 }
13212
13213 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
13214 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
13215 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
13216 if (SDValue Med3 = performIntMed3ImmCombine(
13217 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
13218 return Med3;
13219 }
13220 if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
13221 if (SDValue Med3 = performIntMed3ImmCombine(
13222 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
13223 return Med3;
13224 }
13225
13226 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
13227 if (SDValue Med3 = performIntMed3ImmCombine(
13228 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
13229 return Med3;
13230 }
13231 if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
13232 if (SDValue Med3 = performIntMed3ImmCombine(
13233 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
13234 return Med3;
13235 }
13236
13237 // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
13238 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
13239 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
13240 (Opc == AMDGPUISD::FMIN_LEGACY &&
13241 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
13242 (VT == MVT::f32 || VT == MVT::f64 ||
13243 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
13244 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
13245 Op0.hasOneUse()) {
13246 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
13247 return Res;
13248 }
13249
13250 return SDValue();
13251}
13252
13253static bool isClampZeroToOne(SDValue A, SDValue B) {
13254  if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
13255    if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
13256      // FIXME: Should this be allowing -0.0?
13257 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
13258 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
13259 }
13260 }
13261
13262 return false;
13263}
13264
13265// FIXME: Should only worry about snans for version with chain.
13266SDValue SITargetLowering::performFMed3Combine(SDNode *N,
13267 DAGCombinerInfo &DCI) const {
13268 EVT VT = N->getValueType(0);
13269 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
13270 // NaNs. With a NaN input, the order of the operands may change the result.
13271
13272 SelectionDAG &DAG = DCI.DAG;
13273 SDLoc SL(N);
13274
13275 SDValue Src0 = N->getOperand(0);
13276 SDValue Src1 = N->getOperand(1);
13277 SDValue Src2 = N->getOperand(2);
13278
13279 if (isClampZeroToOne(Src0, Src1)) {
13280 // const_a, const_b, x -> clamp is safe in all cases including signaling
13281 // nans.
13282 // FIXME: Should this be allowing -0.0?
13283 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
13284 }
13285
13286 const MachineFunction &MF = DAG.getMachineFunction();
13287  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
13288
13289 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
13290 // handling no dx10-clamp?
13291 if (Info->getMode().DX10Clamp) {
13292    // If NaNs are clamped to 0, we are free to reorder the inputs.
13293
13294 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13295 std::swap(Src0, Src1);
13296
13297 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
13298 std::swap(Src1, Src2);
13299
13300 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13301 std::swap(Src0, Src1);
13302
13303 if (isClampZeroToOne(Src1, Src2))
13304 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
13305 }
13306
13307 return SDValue();
13308}
13309
13310SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
13311 DAGCombinerInfo &DCI) const {
13312 SDValue Src0 = N->getOperand(0);
13313 SDValue Src1 = N->getOperand(1);
13314 if (Src0.isUndef() && Src1.isUndef())
13315 return DCI.DAG.getUNDEF(N->getValueType(0));
13316 return SDValue();
13317}
13318
13319// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
13320// expanded into a set of cmp/select instructions.
13321bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
13322                                                unsigned NumElem,
13323 bool IsDivergentIdx,
13324 const GCNSubtarget *Subtarget) {
13325  if (UseDivergentRegisterIndexing)
13326    return false;
13327
13328 unsigned VecSize = EltSize * NumElem;
13329
13330  // Sub-dword vectors of 2 dwords or less have a better implementation.
13331 if (VecSize <= 64 && EltSize < 32)
13332 return false;
13333
13334  // Always expand the remaining sub-dword cases, otherwise they will be
13335  // lowered via memory.
13336 if (EltSize < 32)
13337 return true;
13338
13339 // Always do this if var-idx is divergent, otherwise it will become a loop.
13340 if (IsDivergentIdx)
13341 return true;
13342
13343 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
13344 unsigned NumInsts = NumElem /* Number of compares */ +
13345 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
13346
13347 // On some architectures (GFX9) movrel is not available and it's better
13348 // to expand.
13349 if (!Subtarget->hasMovrel())
13350 return NumInsts <= 16;
13351
13352  // If movrel is available, use it instead of expanding for vectors of 8
13353  // elements.
13354 return NumInsts <= 15;
13355}
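// Illustrative walk-through of the heuristic above: for a uniform index into
// v8i32, EltSize = 32 and NumElem = 8, so
//   NumInsts = 8 /*compares*/ + ((32 + 31) / 32) * 8 /*cndmasks*/ = 16.
// Without movrel this satisfies NumInsts <= 16 and the access is expanded;
// with movrel available, 16 > 15, so the indirect-addressing path is kept.
// A divergent index skips the count entirely and always expands.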
13356
13357bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
13358  SDValue Idx = N->getOperand(N->getNumOperands() - 1);
13359 if (isa<ConstantSDNode>(Idx))
13360 return false;
13361
13362 SDValue Vec = N->getOperand(0);
13363 EVT VecVT = Vec.getValueType();
13364 EVT EltVT = VecVT.getVectorElementType();
13365 unsigned EltSize = EltVT.getSizeInBits();
13366 unsigned NumElem = VecVT.getVectorNumElements();
13367
13368  return SITargetLowering::shouldExpandVectorDynExt(
13369      EltSize, NumElem, Idx->isDivergent(), getSubtarget());
13370}
13371
13372SDValue SITargetLowering::performExtractVectorEltCombine(
13373 SDNode *N, DAGCombinerInfo &DCI) const {
13374 SDValue Vec = N->getOperand(0);
13375 SelectionDAG &DAG = DCI.DAG;
13376
13377 EVT VecVT = Vec.getValueType();
13378 EVT VecEltVT = VecVT.getVectorElementType();
13379 EVT ResVT = N->getValueType(0);
13380
13381 unsigned VecSize = VecVT.getSizeInBits();
13382 unsigned VecEltSize = VecEltVT.getSizeInBits();
13383
13384 if ((Vec.getOpcode() == ISD::FNEG ||
13385 Vec.getOpcode() == ISD::FABS) && allUsesHaveSourceMods(N)) {
13386 SDLoc SL(N);
13387 SDValue Idx = N->getOperand(1);
13388 SDValue Elt =
13389 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx);
13390 return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
13391 }
13392
13393 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
13394 // =>
13395 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
13396 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
13397 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
13398 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
13399 SDLoc SL(N);
13400 SDValue Idx = N->getOperand(1);
13401 unsigned Opc = Vec.getOpcode();
13402
13403 switch(Opc) {
13404 default:
13405 break;
13406 // TODO: Support other binary operations.
13407 case ISD::FADD:
13408 case ISD::FSUB:
13409 case ISD::FMUL:
13410 case ISD::ADD:
13411 case ISD::UMIN:
13412 case ISD::UMAX:
13413 case ISD::SMIN:
13414 case ISD::SMAX:
13415 case ISD::FMAXNUM:
13416 case ISD::FMINNUM:
13417 case ISD::FMAXNUM_IEEE:
13418 case ISD::FMINNUM_IEEE:
13419 case ISD::FMAXIMUM:
13420 case ISD::FMINIMUM: {
13421 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
13422 Vec.getOperand(0), Idx);
13423 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
13424 Vec.getOperand(1), Idx);
13425
13426 DCI.AddToWorklist(Elt0.getNode());
13427 DCI.AddToWorklist(Elt1.getNode());
13428 return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
13429 }
13430 }
13431 }
13432
13433 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
13434  if (shouldExpandVectorDynExt(N)) {
13435    SDLoc SL(N);
13436 SDValue Idx = N->getOperand(1);
13437 SDValue V;
13438 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
13439 SDValue IC = DAG.getVectorIdxConstant(I, SL);
13440 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
13441 if (I == 0)
13442 V = Elt;
13443 else
13444 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
13445 }
13446 return V;
13447 }
13448
13449 if (!DCI.isBeforeLegalize())
13450 return SDValue();
13451
13452 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
13453 // elements. This exposes more load reduction opportunities by replacing
13454 // multiple small extract_vector_elements with a single 32-bit extract.
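  // Illustrative example: extracting element 5 from a loaded v8i8 gives
  // BitIndex = 5 * 8 = 40, so EltIdx = 1 and LeftoverBitIdx = 8. The vector is
  // bitcast to the equivalent v2i32, dword 1 is extracted, shifted right by 8,
  // and the low byte is truncated out, so several i8 extracts from the same
  // load can share one 32-bit extract.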
13455 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
13456 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
13457 VecSize > 32 && VecSize % 32 == 0 && Idx) {
13458 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
13459
13460 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
13461 unsigned EltIdx = BitIndex / 32;
13462 unsigned LeftoverBitIdx = BitIndex % 32;
13463 SDLoc SL(N);
13464
13465 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
13466 DCI.AddToWorklist(Cast.getNode());
13467
13468 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
13469 DAG.getConstant(EltIdx, SL, MVT::i32));
13470 DCI.AddToWorklist(Elt.getNode());
13471 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
13472 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
13473 DCI.AddToWorklist(Srl.getNode());
13474
13475 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
13476 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
13477 DCI.AddToWorklist(Trunc.getNode());
13478
13479 if (VecEltVT == ResVT) {
13480 return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
13481 }
13482
13483 assert(ResVT.isScalarInteger());
13484 return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
13485 }
13486
13487 return SDValue();
13488}
13489
13490SDValue
13491SITargetLowering::performInsertVectorEltCombine(SDNode *N,
13492 DAGCombinerInfo &DCI) const {
13493 SDValue Vec = N->getOperand(0);
13494 SDValue Idx = N->getOperand(2);
13495 EVT VecVT = Vec.getValueType();
13496 EVT EltVT = VecVT.getVectorElementType();
13497
13498 // INSERT_VECTOR_ELT (<n x e>, var-idx)
13499 // => BUILD_VECTOR n x select (e, const-idx)
13500  if (isa<ConstantSDNode>(Idx) || !shouldExpandVectorDynExt(N))
13501    return SDValue();
13502
13503 SelectionDAG &DAG = DCI.DAG;
13504 SDLoc SL(N);
13505 SDValue Ins = N->getOperand(1);
13506 EVT IdxVT = Idx.getValueType();
13507
13509 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
13510 SDValue IC = DAG.getConstant(I, SL, IdxVT);
13511 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
13512 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
13513 Ops.push_back(V);
13514 }
13515
13516 return DAG.getBuildVector(VecVT, SL, Ops);
13517}
13518
13519/// Return the source of an fp_extend from f16 to f32, or a converted FP
13520/// constant.
13521static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) {
13522  if (Src.getOpcode() == ISD::FP_EXTEND &&
13523 Src.getOperand(0).getValueType() == MVT::f16) {
13524 return Src.getOperand(0);
13525 }
13526
13527 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
13528 APFloat Val = CFP->getValueAPF();
13529 bool LosesInfo = true;
13530    Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
13531    if (!LosesInfo)
13532 return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
13533 }
13534
13535 return SDValue();
13536}
13537
13538SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
13539 DAGCombinerInfo &DCI) const {
13540 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
13541 "combine only useful on gfx8");
13542
13543 SDValue TruncSrc = N->getOperand(0);
13544 EVT VT = N->getValueType(0);
13545 if (VT != MVT::f16)
13546 return SDValue();
13547
13548 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
13549 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
13550 return SDValue();
13551
13552 SelectionDAG &DAG = DCI.DAG;
13553 SDLoc SL(N);
13554
13555 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
13556 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
13557 // casting back.
13558
13559 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
13560 // fmin(fmax(a, b), fmax(fmin(a, b), c))
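  // For example, with a = 1.0, b = 3.0, c = 2.0:
  //   fmax(a, b) = 3.0, fmin(a, b) = 1.0, fmax(1.0, c) = 2.0,
  //   fmin(3.0, 2.0) = 2.0, which is indeed the median of {1.0, 3.0, 2.0}.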
13561 SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0));
13562 if (!A)
13563 return SDValue();
13564
13565 SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1));
13566 if (!B)
13567 return SDValue();
13568
13569 SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2));
13570 if (!C)
13571 return SDValue();
13572
13573 // This changes signaling nan behavior. If an input is a signaling nan, it
13574 // would have been quieted by the fpext originally. We don't care because
13575 // these are unconstrained ops. If we needed to insert quieting canonicalizes
13576 // we would be worse off than just doing the promotion.
13577 SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B);
13578 SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B);
13579 SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C);
13580 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
13581}
13582
13583unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
13584 const SDNode *N0,
13585 const SDNode *N1) const {
13586 EVT VT = N0->getValueType(0);
13587
13588 // Only do this if we are not trying to support denormals. v_mad_f32 does not
13589 // support denormals ever.
13590  if (((VT == MVT::f32 &&
13591        denormalModeIsFlushAllF32(DAG.getMachineFunction())) ||
13592       (VT == MVT::f16 && Subtarget->hasMadF16() &&
13593        denormalModeIsFlushAllF64F16(DAG.getMachineFunction()))) &&
13594      isOperationLegal(ISD::FMAD, VT))
13595    return ISD::FMAD;
13596
13597 const TargetOptions &Options = DAG.getTarget().Options;
13598 if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
13599 (N0->getFlags().hasAllowContract() &&
13600 N1->getFlags().hasAllowContract())) &&
13601      isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) {
13602    return ISD::FMA;
13603 }
13604
13605 return 0;
13606}
13607
13608// For a reassociatable opcode perform:
13609// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
13610SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
13611 SelectionDAG &DAG) const {
13612 EVT VT = N->getValueType(0);
13613 if (VT != MVT::i32 && VT != MVT::i64)
13614 return SDValue();
13615
13616 if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
13617 return SDValue();
13618
13619 unsigned Opc = N->getOpcode();
13620 SDValue Op0 = N->getOperand(0);
13621 SDValue Op1 = N->getOperand(1);
13622
13623 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
13624 return SDValue();
13625
13626 if (Op0->isDivergent())
13627 std::swap(Op0, Op1);
13628
13629 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
13630 return SDValue();
13631
13632 SDValue Op2 = Op1.getOperand(1);
13633 Op1 = Op1.getOperand(0);
13634 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
13635 return SDValue();
13636
13637 if (Op1->isDivergent())
13638 std::swap(Op1, Op2);
13639
13640 SDLoc SL(N);
13641 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
13642 return DAG.getNode(Opc, SL, VT, Add1, Op2);
13643}
13644
13645static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL,
13646 EVT VT,
13647 SDValue N0, SDValue N1, SDValue N2,
13648 bool Signed) {
13649  unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
13650  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
13651 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
13652 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
13653}
13654
13655// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
13656// multiplies, if any.
13657//
13658// Full 64-bit multiplies that feed into an addition are lowered here instead
13659// of using the generic expansion. The generic expansion ends up with
13660// a tree of ADD nodes that prevents us from using the "add" part of the
13661// MAD instruction. The expansion produced here results in a chain of ADDs
13662// instead of a tree.
13663SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
13664 DAGCombinerInfo &DCI) const {
13665 assert(N->getOpcode() == ISD::ADD);
13666
13667 SelectionDAG &DAG = DCI.DAG;
13668 EVT VT = N->getValueType(0);
13669 SDLoc SL(N);
13670 SDValue LHS = N->getOperand(0);
13671 SDValue RHS = N->getOperand(1);
13672
13673 if (VT.isVector())
13674 return SDValue();
13675
13676 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
13677 // result in scalar registers for uniform values.
13678 if (!N->isDivergent() && Subtarget->hasSMulHi())
13679 return SDValue();
13680
13681 unsigned NumBits = VT.getScalarSizeInBits();
13682 if (NumBits <= 32 || NumBits > 64)
13683 return SDValue();
13684
13685 if (LHS.getOpcode() != ISD::MUL) {
13686 assert(RHS.getOpcode() == ISD::MUL);
13687 std::swap(LHS, RHS);
13688 }
13689
13690 // Avoid the fold if it would unduly increase the number of multiplies due to
13691 // multiple uses, except on hardware with full-rate multiply-add (which is
13692 // part of full-rate 64-bit ops).
13693 if (!Subtarget->hasFullRate64Ops()) {
13694 unsigned NumUsers = 0;
13695 for (SDNode *Use : LHS->uses()) {
13696 // There is a use that does not feed into addition, so the multiply can't
13697 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
13698 if (Use->getOpcode() != ISD::ADD)
13699 return SDValue();
13700
13701 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
13702 // MUL + 3xADD + 3xADDC over 3xMAD.
13703 ++NumUsers;
13704 if (NumUsers >= 3)
13705 return SDValue();
13706 }
13707 }
13708
13709 SDValue MulLHS = LHS.getOperand(0);
13710 SDValue MulRHS = LHS.getOperand(1);
13711 SDValue AddRHS = RHS;
13712
13713 // Always check whether operands are small unsigned values, since that
13714 // knowledge is useful in more cases. Check for small signed values only if
13715 // doing so can unlock a shorter code sequence.
13716 bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
13717 bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;
13718
13719 bool MulSignedLo = false;
13720 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
13721 MulSignedLo = numBitsSigned(MulLHS, DAG) <= 32 &&
13722 numBitsSigned(MulRHS, DAG) <= 32;
13723 }
13724
13725 // The operands and final result all have the same number of bits. If
13726 // operands need to be extended, they can be extended with garbage. The
13727 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
13728 // truncated away in the end.
13729 if (VT != MVT::i64) {
13730 MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
13731 MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
13732 AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
13733 }
13734
13735 // The basic code generated is conceptually straightforward. Pseudo code:
13736 //
13737 // accum = mad_64_32 lhs.lo, rhs.lo, accum
13738 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
13739 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
13740 //
13741 // The second and third lines are optional, depending on whether the factors
13742 // are {sign,zero}-extended or not.
13743 //
13744 // The actual DAG is noisier than the pseudo code, but only due to
13745 // instructions that disassemble values into low and high parts, and
13746 // assemble the final result.
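  // As a concrete check of the pseudo code (illustrative): with
  // lhs = 0x1'00000002, rhs = 3 and accum = 5,
  //   mad_64_32(lhs.lo = 2, rhs.lo = 3, 5)   -> accum = 0x0000000B,
  //   accum.hi += lhs.hi * rhs.lo = 1 * 3    -> accum = 0x3'0000000B,
  // which matches lhs * rhs + accum = 0x3'00000006 + 5. The rhs.hi correction
  // contributes nothing here since rhs.hi == 0.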
13747 SDValue One = DAG.getConstant(1, SL, MVT::i32);
13748
13749 auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
13750 auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
13751 SDValue Accum =
13752 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
13753
13754 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
13755 SDValue AccumLo, AccumHi;
13756 std::tie(AccumLo, AccumHi) = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
13757
13758 if (!MulLHSUnsigned32) {
13759 auto MulLHSHi =
13760 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
13761 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
13762 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
13763 }
13764
13765 if (!MulRHSUnsigned32) {
13766 auto MulRHSHi =
13767 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
13768 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
13769 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
13770 }
13771
13772 Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
13773 Accum = DAG.getBitcast(MVT::i64, Accum);
13774 }
13775
13776 if (VT != MVT::i64)
13777 Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
13778 return Accum;
13779}
13780
13781// Collect the ultimate src of each of the mul node's operands, and confirm
13782// each operand is no more than 8 bits wide.
13783static std::optional<ByteProvider<SDValue>>
13784handleMulOperand(const SDValue &MulOperand) {
13785 auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
13786 if (!Byte0 || Byte0->isConstantZero()) {
13787 return std::nullopt;
13788 }
13789 auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
13790 if (Byte1 && !Byte1->isConstantZero()) {
13791 return std::nullopt;
13792 }
13793 return Byte0;
13794}
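// Illustrative example (assuming the usual ByteProvider behaviour): for an
// operand like (and i32 %x, 0xff), byte 0 resolves to byte 0 of %x and byte 1
// resolves to constant zero, so the operand qualifies; for
// (and i32 %x, 0xffff), byte 1 is not constant zero and std::nullopt is
// returned.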
13795
13796static unsigned addPermMasks(unsigned First, unsigned Second) {
13797 unsigned FirstCs = First & 0x0c0c0c0c;
13798 unsigned SecondCs = Second & 0x0c0c0c0c;
13799 unsigned FirstNoCs = First & ~0x0c0c0c0c;
13800 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
13801
13802 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
13803 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
13804 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
13805 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
13806
13807 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
13808}
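// Worked example: First = 0x0c0c0c00 routes source byte 0 to result byte 0
// and zeroes the rest; Second = 0x0c0c010c routes source byte 1 to result
// byte 1. Then FirstNoCs | SecondNoCs = 0x00000100 and
// FirstCs & SecondCs = 0x0c0c0000, so the merged mask is 0x0c0c0100: result
// bytes 0 and 1 select source bytes 0 and 1, and bytes 2 and 3 stay constant
// zero (0x0c).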
13809
13810struct DotSrc {
13811  SDValue SrcOp;
13812  int64_t PermMask;
13813  int64_t DWordOffset;
13814};
13815
13816static void placeSources(ByteProvider<SDValue> Src0,
13817                         ByteProvider<SDValue> Src1,
13818                         SmallVectorImpl<DotSrc> &Src0s,
13819                         SmallVectorImpl<DotSrc> &Src1s, int Step) {
13820
13821 assert(Src0.Src.has_value() && Src1.Src.has_value());
13822 // Src0s and Src1s are empty, just place arbitrarily.
13823 if (Step == 0) {
13824 Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
13825 Src0.SrcOffset / 4});
13826 Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
13827 Src1.SrcOffset / 4});
13828 return;
13829 }
13830
13831 for (int BPI = 0; BPI < 2; BPI++) {
13832 std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
13833 if (BPI == 1) {
13834 BPP = {Src1, Src0};
13835 }
13836 unsigned ZeroMask = 0x0c0c0c0c;
13837 unsigned FMask = 0xFF << (8 * (3 - Step));
13838
13839 unsigned FirstMask =
13840 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
13841 unsigned SecondMask =
13842 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
13843    // Attempt to find the Src vector which contains our SDValue; if found, add
13844    // our perm mask to the existing one. If we are unable to find a match for
13845    // the first SDValue, attempt to find a match for the second.
13846 int FirstGroup = -1;
13847 for (int I = 0; I < 2; I++) {
13848 SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
13849 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
13850 return IterElt.SrcOp == *BPP.first.Src &&
13851 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
13852 };
13853
13854 auto Match = llvm::find_if(Srcs, MatchesFirst);
13855 if (Match != Srcs.end()) {
13856 Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
13857 FirstGroup = I;
13858 break;
13859 }
13860 }
13861 if (FirstGroup != -1) {
13862 SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
13863 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
13864 return IterElt.SrcOp == *BPP.second.Src &&
13865 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
13866 };
13867 auto Match = llvm::find_if(Srcs, MatchesSecond);
13868 if (Match != Srcs.end()) {
13869 Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
13870 } else
13871 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
13872 return;
13873 }
13874 }
13875
13876 // If we have made it here, then we could not find a match in Src0s or Src1s
13877 // for either Src0 or Src1, so just place them arbitrarily.
13878
13879 unsigned ZeroMask = 0x0c0c0c0c;
13880 unsigned FMask = 0xFF << (8 * (3 - Step));
13881
13882 Src0s.push_back(
13883 {*Src0.Src,
13884 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
13885 Src1.SrcOffset / 4});
13886 Src1s.push_back(
13887 {*Src1.Src,
13888 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
13889 Src1.SrcOffset / 4});
13890
13891 return;
13892}
13893
13894static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
13895                              SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
13896 bool IsAny) {
13897
13898 // If we just have one source, just permute it accordingly.
13899 if (Srcs.size() == 1) {
13900 auto Elt = Srcs.begin();
13901 auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);
13902
13903 // v_perm will produce the original value
13904 if (Elt->PermMask == 0x3020100)
13905 return EltOp;
13906
13907 return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
13908 DAG.getConstant(Elt->PermMask, SL, MVT::i32));
13909 }
13910
13911 auto FirstElt = Srcs.begin();
13912 auto SecondElt = std::next(FirstElt);
13913
13914  SmallVector<SDValue, 2> Perms;
13915
13916  // If we have multiple sources in the chain, combine them via perms (using
13917  // the calculated perm masks) and ORs.
13918 while (true) {
13919 auto FirstMask = FirstElt->PermMask;
13920 auto SecondMask = SecondElt->PermMask;
13921
13922 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
13923 unsigned FirstPlusFour = FirstMask | 0x04040404;
13924    // 0x0c + 0x04 = 0x10, so ANDing with 0x0F will produce 0x00 for any
13925    // original 0x0C.
13926 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
13927
13928 auto PermMask = addPermMasks(FirstMask, SecondMask);
13929 auto FirstVal =
13930 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
13931 auto SecondVal =
13932 getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);
13933
13934 Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
13935 SecondVal,
13936 DAG.getConstant(PermMask, SL, MVT::i32)));
13937
13938 FirstElt = std::next(SecondElt);
13939 if (FirstElt == Srcs.end())
13940 break;
13941
13942 SecondElt = std::next(FirstElt);
13943 // If we only have a FirstElt, then just combine that into the cumulative
13944 // source node.
13945 if (SecondElt == Srcs.end()) {
13946 auto EltOp =
13947 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
13948
13949 Perms.push_back(
13950 DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
13951 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
13952 break;
13953 }
13954 }
13955
13956 assert(Perms.size() == 1 || Perms.size() == 2);
13957 return Perms.size() == 2
13958 ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
13959 : Perms[0];
13960}
13961
13962static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
13963 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
13964 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
13965 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
13966 EntryMask += ZeroMask;
13967 }
13968}
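// Illustrative example: a two-element chain built at steps 0 and 1 ends up
// with a mask like 0x00010c0c (the live byte selects sit in the two high
// result bytes). With ChainLength == 2 the mask is shifted right by 16 to
// 0x0001 and 0x0c0c0000 is added, giving 0x0c0c0001: the two live selects
// move to the low result bytes and the unused high bytes become constant
// zero, so they contribute nothing to the dot.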
13969
13970static bool isMul(const SDValue Op) {
13971 auto Opcode = Op.getOpcode();
13972
13973 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
13974 Opcode == AMDGPUISD::MUL_I24);
13975}
13976
13977static std::optional<bool>
13978checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
13979                       ByteProvider<SDValue> &Src1, const SDValue &S0Op,
13980 const SDValue &S1Op, const SelectionDAG &DAG) {
13981  // If both ops are i8s (pre legalize-dag), then the signedness semantics
13982  // of the dot4 are irrelevant.
13983 if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
13984 return false;
13985
13986 auto Known0 = DAG.computeKnownBits(S0Op, 0);
13987 bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
13988 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
13989 auto Known1 = DAG.computeKnownBits(S1Op, 0);
13990 bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
13991 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
13992
13993 assert(!(S0IsUnsigned && S0IsSigned));
13994 assert(!(S1IsUnsigned && S1IsSigned));
13995
13996 // There are 9 possible permutations of
13997 // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
13998
13999 // In two permutations, the sign bits are known to be the same for both Ops,
14000 // so simply return Signed / Unsigned corresponding to the MSB
14001
14002 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
14003 return S0IsSigned;
14004
14005 // In another two permutations, the sign bits are known to be opposite. In
14006 // this case return std::nullopt to indicate a bad match.
14007
14008 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
14009 return std::nullopt;
14010
14011 // In the remaining five permutations, we don't know the value of the sign
14012 // bit for at least one Op. Since we have a valid ByteProvider, we know that
14013  // the upper bits must be extension bits. Thus, the only way for the sign
14014  // bit to be unknown is if it was sign-extended from an unknown value, or if
14015  // it was any-extended. In either case, it is correct to use the signed
14016  // version of the dot4 signedness semantics.
14017
14018  // In two such permutations, we know the sign bit is set for
14019  // one op, and the other is unknown. It is okay to use the signed version of
14020  // dot4.
14021 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
14022 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
14023 return true;
14024
14025  // In one such permutation, we don't know either of the sign bits. It is okay
14026  // to use the signed version of dot4.
14027 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
14028 return true;
14029
14030  // In two such permutations, we know the sign bit is unset for
14031  // one op, and the other is unknown. Return std::nullopt to indicate a
14032  // bad match.
14033 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
14034 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
14035 return std::nullopt;
14036
14037 llvm_unreachable("Fully covered condition");
14038}
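// For instance, if both multiply operands are zero-extended i8 loads, both
// sides have known leading zeros, so the caller selects amdgcn_udot4; if one
// side is known non-negative and the other is known negative, the sign
// conventions conflict and std::nullopt aborts the match.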
14039
14040SDValue SITargetLowering::performAddCombine(SDNode *N,
14041 DAGCombinerInfo &DCI) const {
14042 SelectionDAG &DAG = DCI.DAG;
14043 EVT VT = N->getValueType(0);
14044 SDLoc SL(N);
14045 SDValue LHS = N->getOperand(0);
14046 SDValue RHS = N->getOperand(1);
14047
14048 if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
14049 if (Subtarget->hasMad64_32()) {
14050 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
14051 return Folded;
14052 }
14053 }
14054
14055 if (SDValue V = reassociateScalarOps(N, DAG)) {
14056 return V;
14057 }
14058
14059 if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
14060 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
14061 SDValue TempNode(N, 0);
14062 std::optional<bool> IsSigned;
14063    SmallVector<DotSrc, 4> Src0s;
14064    SmallVector<DotSrc, 4> Src1s;
14065    SmallVector<SDValue, 4> Src2s;
14066
14067 // Match the v_dot4 tree, while collecting src nodes.
14068 int ChainLength = 0;
14069 for (int I = 0; I < 4; I++) {
14070 auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
14071 if (MulIdx == -1)
14072 break;
14073 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
14074 if (!Src0)
14075 break;
14076 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
14077 if (!Src1)
14078 break;
14079
14080 auto IterIsSigned = checkDot4MulSignedness(
14081 TempNode->getOperand(MulIdx), *Src0, *Src1,
14082 TempNode->getOperand(MulIdx)->getOperand(0),
14083 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
14084 if (!IterIsSigned)
14085 break;
14086 if (!IsSigned)
14087 IsSigned = *IterIsSigned;
14088 if (*IterIsSigned != *IsSigned)
14089 break;
14090 placeSources(*Src0, *Src1, Src0s, Src1s, I);
14091 auto AddIdx = 1 - MulIdx;
14092      // Allow the special case where add (add (mul24, 0), mul24) was folded
14093      // into add (mul24, mul24).
14094 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
14095 Src2s.push_back(TempNode->getOperand(AddIdx));
14096 auto Src0 =
14097 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
14098 if (!Src0)
14099 break;
14100 auto Src1 =
14101 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
14102 if (!Src1)
14103 break;
14104 auto IterIsSigned = checkDot4MulSignedness(
14105 TempNode->getOperand(AddIdx), *Src0, *Src1,
14106 TempNode->getOperand(AddIdx)->getOperand(0),
14107 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
14108 if (!IterIsSigned)
14109 break;
14110 assert(IsSigned);
14111 if (*IterIsSigned != *IsSigned)
14112 break;
14113 placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
14114 Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
14115 ChainLength = I + 2;
14116 break;
14117 }
14118
14119 TempNode = TempNode->getOperand(AddIdx);
14120 Src2s.push_back(TempNode);
14121 ChainLength = I + 1;
14122 if (TempNode->getNumOperands() < 2)
14123 break;
14124 LHS = TempNode->getOperand(0);
14125 RHS = TempNode->getOperand(1);
14126 }
14127
14128 if (ChainLength < 2)
14129 return SDValue();
14130
14131    // Masks were constructed under the assumption that we would find a chain
14132    // of length 4. If not, then we need to zero out the unused MSB bytes (via a
14133    // perm mask of 0x0c) so they do not affect the dot calculation.
14134 if (ChainLength < 4) {
14135 fixMasks(Src0s, ChainLength);
14136 fixMasks(Src1s, ChainLength);
14137 }
14138
14139 SDValue Src0, Src1;
14140
14141 // If we are just using a single source for both, and have permuted the
14142 // bytes consistently, we can just use the sources without permuting
14143 // (commutation).
14144 bool UseOriginalSrc = false;
14145 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
14146 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
14147 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
14148 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
14149 SmallVector<unsigned, 4> SrcBytes;
14150 auto Src0Mask = Src0s.begin()->PermMask;
14151 SrcBytes.push_back(Src0Mask & 0xFF000000);
14152 bool UniqueEntries = true;
14153 for (auto I = 1; I < 4; I++) {
14154 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
14155
14156 if (is_contained(SrcBytes, NextByte)) {
14157 UniqueEntries = false;
14158 break;
14159 }
14160 SrcBytes.push_back(NextByte);
14161 }
14162
14163 if (UniqueEntries) {
14164 UseOriginalSrc = true;
14165
14166 auto FirstElt = Src0s.begin();
14167 auto FirstEltOp =
14168 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
14169
14170 auto SecondElt = Src1s.begin();
14171 auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
14172 SecondElt->DWordOffset);
14173
14174 Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL,
14175 MVT::getIntegerVT(32));
14176 Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL,
14177 MVT::getIntegerVT(32));
14178 }
14179 }
14180
14181 if (!UseOriginalSrc) {
14182 Src0 = resolveSources(DAG, SL, Src0s, false, true);
14183 Src1 = resolveSources(DAG, SL, Src1s, false, true);
14184 }
14185
14186 assert(IsSigned);
14187 SDValue Src2 =
14188 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
14189
14190 SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
14191 : Intrinsic::amdgcn_udot4,
14192 SL, MVT::i64);
14193
14194 assert(!VT.isVector());
14195 auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
14196 Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));
14197
14198 return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
14199 }
14200
14201 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
14202 return SDValue();
14203
14204 // add x, zext (setcc) => uaddo_carry x, 0, setcc
14205 // add x, sext (setcc) => usubo_carry x, 0, setcc
14206 unsigned Opc = LHS.getOpcode();
14207 if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
14208 Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY)
14209 std::swap(RHS, LHS);
14210
14211 Opc = RHS.getOpcode();
14212 switch (Opc) {
14213 default: break;
14214 case ISD::ZERO_EXTEND:
14215 case ISD::SIGN_EXTEND:
14216 case ISD::ANY_EXTEND: {
14217 auto Cond = RHS.getOperand(0);
14218 // If this won't be a real VOPC output, we would still need to insert an
14219 // extra instruction anyway.
14220 if (!isBoolSGPR(Cond))
14221 break;
14222 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
14223 SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
14224 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;
14225 return DAG.getNode(Opc, SL, VTList, Args);
14226 }
14227 case ISD::UADDO_CARRY: {
14228 // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
14229 if (!isNullConstant(RHS.getOperand(1)))
14230 break;
14231 SDValue Args[] = { LHS, RHS.getOperand(0), RHS.getOperand(2) };
14232 return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
14233 }
14234 }
14235 return SDValue();
14236}
14237
14238SDValue SITargetLowering::performSubCombine(SDNode *N,
14239 DAGCombinerInfo &DCI) const {
14240 SelectionDAG &DAG = DCI.DAG;
14241 EVT VT = N->getValueType(0);
14242
14243 if (VT != MVT::i32)
14244 return SDValue();
14245
14246 SDLoc SL(N);
14247 SDValue LHS = N->getOperand(0);
14248 SDValue RHS = N->getOperand(1);
14249
14250 // sub x, zext (setcc) => usubo_carry x, 0, setcc
14251 // sub x, sext (setcc) => uaddo_carry x, 0, setcc
14252 unsigned Opc = RHS.getOpcode();
14253 switch (Opc) {
14254 default: break;
14255 case ISD::ZERO_EXTEND:
14256 case ISD::SIGN_EXTEND:
14257 case ISD::ANY_EXTEND: {
14258 auto Cond = RHS.getOperand(0);
14259 // If this won't be a real VOPC output, we would still need to insert an
14260 // extra instruction anyway.
14261 if (!isBoolSGPR(Cond))
14262 break;
14263 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
14264 SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
14265 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
14266 return DAG.getNode(Opc, SL, VTList, Args);
14267 }
14268 }
14269
14270 if (LHS.getOpcode() == ISD::USUBO_CARRY) {
14271 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
14272 if (!isNullConstant(LHS.getOperand(1)))
14273 return SDValue();
14274 SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) };
14275 return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
14276 }
14277 return SDValue();
14278}
14279
14280SDValue SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
14281 DAGCombinerInfo &DCI) const {
14282
14283 if (N->getValueType(0) != MVT::i32)
14284 return SDValue();
14285
14286 if (!isNullConstant(N->getOperand(1)))
14287 return SDValue();
14288
14289 SelectionDAG &DAG = DCI.DAG;
14290 SDValue LHS = N->getOperand(0);
14291
14292 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
14293 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
14294 unsigned LHSOpc = LHS.getOpcode();
14295 unsigned Opc = N->getOpcode();
14296 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
14297 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
14298 SDValue Args[] = { LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2) };
14299 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
14300 }
14301 return SDValue();
14302}
14303
14304SDValue SITargetLowering::performFAddCombine(SDNode *N,
14305 DAGCombinerInfo &DCI) const {
14306 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14307 return SDValue();
14308
14309 SelectionDAG &DAG = DCI.DAG;
14310 EVT VT = N->getValueType(0);
14311
14312 SDLoc SL(N);
14313 SDValue LHS = N->getOperand(0);
14314 SDValue RHS = N->getOperand(1);
14315
14316 // These should really be instruction patterns, but writing patterns with
14317 // source modifiers is a pain.
14318
14319 // fadd (fadd (a, a), b) -> mad 2.0, a, b
14320 if (LHS.getOpcode() == ISD::FADD) {
14321 SDValue A = LHS.getOperand(0);
14322 if (A == LHS.getOperand(1)) {
14323 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14324 if (FusedOp != 0) {
14325 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14326 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
14327 }
14328 }
14329 }
14330
14331 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
14332 if (RHS.getOpcode() == ISD::FADD) {
14333 SDValue A = RHS.getOperand(0);
14334 if (A == RHS.getOperand(1)) {
14335 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14336 if (FusedOp != 0) {
14337 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14338 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
14339 }
14340 }
14341 }
14342
14343 return SDValue();
14344}
14345
14346SDValue SITargetLowering::performFSubCombine(SDNode *N,
14347 DAGCombinerInfo &DCI) const {
14348 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14349 return SDValue();
14350
14351 SelectionDAG &DAG = DCI.DAG;
14352 SDLoc SL(N);
14353 EVT VT = N->getValueType(0);
14354 assert(!VT.isVector());
14355
14356 // Try to get the fneg to fold into the source modifier. This undoes generic
14357 // DAG combines and folds them into the mad.
14358 //
14359 // Only do this if we are not trying to support denormals. v_mad_f32 does
14360 // not support denormals ever.
14361 SDValue LHS = N->getOperand(0);
14362 SDValue RHS = N->getOperand(1);
14363 if (LHS.getOpcode() == ISD::FADD) {
14364 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
14365 SDValue A = LHS.getOperand(0);
14366 if (A == LHS.getOperand(1)) {
14367 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14368 if (FusedOp != 0){
14369 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14370 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
14371
14372 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
14373 }
14374 }
14375 }
14376
14377 if (RHS.getOpcode() == ISD::FADD) {
14378 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
14379
14380 SDValue A = RHS.getOperand(0);
14381 if (A == RHS.getOperand(1)) {
14382 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14383 if (FusedOp != 0){
14384 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
14385 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
14386 }
14387 }
14388 }
14389
14390 return SDValue();
14391}
14392
14393SDValue SITargetLowering::performFDivCombine(SDNode *N,
14394 DAGCombinerInfo &DCI) const {
14395 SelectionDAG &DAG = DCI.DAG;
14396 SDLoc SL(N);
14397 EVT VT = N->getValueType(0);
14398 if (VT != MVT::f16 || !Subtarget->has16BitInsts())
14399 return SDValue();
14400
14401 SDValue LHS = N->getOperand(0);
14402 SDValue RHS = N->getOperand(1);
14403
14404 SDNodeFlags Flags = N->getFlags();
14405 SDNodeFlags RHSFlags = RHS->getFlags();
14406 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
14407 !RHS->hasOneUse())
14408 return SDValue();
14409
14410 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
14411 bool IsNegative = false;
14412 if (CLHS->isExactlyValue(1.0) ||
14413 (IsNegative = CLHS->isExactlyValue(-1.0))) {
14414 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
14415 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
14416 if (RHS.getOpcode() == ISD::FSQRT) {
14417 // TODO: Or in RHS flags, somehow missing from SDNodeFlags
14418 SDValue Rsq =
14419 DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
14420 return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
14421 }
14422 }
14423 }
14424
14425 return SDValue();
14426}
14427
14428SDValue SITargetLowering::performFMACombine(SDNode *N,
14429 DAGCombinerInfo &DCI) const {
14430 SelectionDAG &DAG = DCI.DAG;
14431 EVT VT = N->getValueType(0);
14432 SDLoc SL(N);
14433
14434 if (!Subtarget->hasDot7Insts() || VT != MVT::f32)
14435 return SDValue();
14436
14437 // FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
14438 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
14439 SDValue Op1 = N->getOperand(0);
14440 SDValue Op2 = N->getOperand(1);
14441 SDValue FMA = N->getOperand(2);
14442
14443 if (FMA.getOpcode() != ISD::FMA ||
14444 Op1.getOpcode() != ISD::FP_EXTEND ||
14445 Op2.getOpcode() != ISD::FP_EXTEND)
14446 return SDValue();
14447
14448 // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
14449 // regardless of the denorm mode setting. Therefore,
14450 // unsafe-fp-math/fp-contract is sufficient to allow generating fdot2.
14451 const TargetOptions &Options = DAG.getTarget().Options;
14452 if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
14453 (N->getFlags().hasAllowContract() &&
14454 FMA->getFlags().hasAllowContract())) {
14455 Op1 = Op1.getOperand(0);
14456 Op2 = Op2.getOperand(0);
14457 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14458        Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14459      return SDValue();
14460
14461 SDValue Vec1 = Op1.getOperand(0);
14462 SDValue Idx1 = Op1.getOperand(1);
14463 SDValue Vec2 = Op2.getOperand(0);
14464
14465 SDValue FMAOp1 = FMA.getOperand(0);
14466 SDValue FMAOp2 = FMA.getOperand(1);
14467 SDValue FMAAcc = FMA.getOperand(2);
14468
14469 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
14470 FMAOp2.getOpcode() != ISD::FP_EXTEND)
14471 return SDValue();
14472
14473 FMAOp1 = FMAOp1.getOperand(0);
14474 FMAOp2 = FMAOp2.getOperand(0);
14475 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14476        FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14477      return SDValue();
14478
14479 SDValue Vec3 = FMAOp1.getOperand(0);
14480 SDValue Vec4 = FMAOp2.getOperand(0);
14481 SDValue Idx2 = FMAOp1.getOperand(1);
14482
14483 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
14484 // Idx1 and Idx2 cannot be the same.
14485 Idx1 == Idx2)
14486 return SDValue();
14487
14488 if (Vec1 == Vec2 || Vec3 == Vec4)
14489 return SDValue();
14490
14491 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
14492 return SDValue();
14493
14494 if ((Vec1 == Vec3 && Vec2 == Vec4) ||
14495 (Vec1 == Vec4 && Vec2 == Vec3)) {
14496 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
14497 DAG.getTargetConstant(0, SL, MVT::i1));
14498 }
14499 }
14500 return SDValue();
14501}
14502
14503SDValue SITargetLowering::performSetCCCombine(SDNode *N,
14504 DAGCombinerInfo &DCI) const {
14505 SelectionDAG &DAG = DCI.DAG;
14506 SDLoc SL(N);
14507
14508 SDValue LHS = N->getOperand(0);
14509 SDValue RHS = N->getOperand(1);
14510 EVT VT = LHS.getValueType();
14511 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
14512
14513 auto CRHS = dyn_cast<ConstantSDNode>(RHS);
14514 if (!CRHS) {
14515 CRHS = dyn_cast<ConstantSDNode>(LHS);
14516 if (CRHS) {
14517 std::swap(LHS, RHS);
14518      CC = getSetCCSwappedOperands(CC);
14519    }
14520 }
14521
14522 if (CRHS) {
14523 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
14524 isBoolSGPR(LHS.getOperand(0))) {
14525 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
14526 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
14527 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
14528 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
14529 if ((CRHS->isAllOnes() &&
14530 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
14531 (CRHS->isZero() &&
14532 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
14533 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
14534 DAG.getConstant(-1, SL, MVT::i1));
14535 if ((CRHS->isAllOnes() &&
14536 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
14537 (CRHS->isZero() &&
14538 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
14539 return LHS.getOperand(0);
14540 }
14541
14542 const APInt &CRHSVal = CRHS->getAPIntValue();
14543 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
14544 LHS.getOpcode() == ISD::SELECT &&
14545 isa<ConstantSDNode>(LHS.getOperand(1)) &&
14546 isa<ConstantSDNode>(LHS.getOperand(2)) &&
14547 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
14548 isBoolSGPR(LHS.getOperand(0))) {
14549 // Given CT != FT:
14550 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
14551 // setcc (select cc, CT, CF), CF, ne => cc
14552 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
14553 // setcc (select cc, CT, CF), CT, eq => cc
14554 const APInt &CT = LHS.getConstantOperandAPInt(1);
14555 const APInt &CF = LHS.getConstantOperandAPInt(2);
14556
14557 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
14558 (CT == CRHSVal && CC == ISD::SETNE))
14559 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
14560 DAG.getConstant(-1, SL, MVT::i1));
14561 if ((CF == CRHSVal && CC == ISD::SETNE) ||
14562 (CT == CRHSVal && CC == ISD::SETEQ))
14563 return LHS.getOperand(0);
14564 }
14565 }
14566
14567 if (VT != MVT::f32 && VT != MVT::f64 &&
14568 (!Subtarget->has16BitInsts() || VT != MVT::f16))
14569 return SDValue();
14570
14571 // Match isinf/isfinite pattern
14572 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
14573 // (fcmp one (fabs x), inf) -> (fp_class x,
14574 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
14575 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) && LHS.getOpcode() == ISD::FABS) {
14576    const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
14577    if (!CRHS)
14578 return SDValue();
14579
14580 const APFloat &APF = CRHS->getValueAPF();
14581 if (APF.isInfinity() && !APF.isNegative()) {
14582      const unsigned IsInfMask = SIInstrFlags::P_INFINITY |
14583                                 SIInstrFlags::N_INFINITY;
14584      const unsigned IsFiniteMask = SIInstrFlags::N_ZERO |
14585                                    SIInstrFlags::P_ZERO |
14586                                    SIInstrFlags::N_NORMAL |
14587                                    SIInstrFlags::P_NORMAL |
14588                                    SIInstrFlags::N_SUBNORMAL |
14589                                    SIInstrFlags::P_SUBNORMAL;
14590      unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
14591 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
14592 DAG.getConstant(Mask, SL, MVT::i32));
14593 }
14594 }
14595
14596 return SDValue();
14597}
14598
14599SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
14600 DAGCombinerInfo &DCI) const {
14601 SelectionDAG &DAG = DCI.DAG;
14602 SDLoc SL(N);
14603 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
14604
14605 SDValue Src = N->getOperand(0);
14606 SDValue Shift = N->getOperand(0);
14607
14608 // TODO: Extend type shouldn't matter (assuming legal types).
14609 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
14610 Shift = Shift.getOperand(0);
14611
14612 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
14613 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
14614 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
14615 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
14616 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
14617 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
14618 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
14619 SDValue Shifted = DAG.getZExtOrTrunc(Shift.getOperand(0),
14620 SDLoc(Shift.getOperand(0)), MVT::i32);
14621
14622 unsigned ShiftOffset = 8 * Offset;
14623 if (Shift.getOpcode() == ISD::SHL)
14624 ShiftOffset -= C->getZExtValue();
14625 else
14626 ShiftOffset += C->getZExtValue();
14627
14628 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
14629 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
14630 MVT::f32, Shifted);
14631 }
14632 }
14633 }
14634
14635 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14636 APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
14637 if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
14638 // We simplified Src. If this node is not dead, visit it again so it is
14639 // folded properly.
14640 if (N->getOpcode() != ISD::DELETED_NODE)
14641 DCI.AddToWorklist(N);
14642 return SDValue(N, 0);
14643 }
14644
14645 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
14646  if (SDValue DemandedSrc =
14647          TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
14648    return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
14649
14650 return SDValue();
14651}
14652
14653SDValue SITargetLowering::performClampCombine(SDNode *N,
14654 DAGCombinerInfo &DCI) const {
14655 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
14656 if (!CSrc)
14657 return SDValue();
14658
14659 const MachineFunction &MF = DCI.DAG.getMachineFunction();
14660 const APFloat &F = CSrc->getValueAPF();
14661 APFloat Zero = APFloat::getZero(F.getSemantics());
14662 if (F < Zero ||
14663 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
14664 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
14665 }
14666
14667 APFloat One(F.getSemantics(), "1.0");
14668 if (F > One)
14669 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
14670
14671 return SDValue(CSrc, 0);
14672}
14673
14674
14675SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
14676                                            DAGCombinerInfo &DCI) const {
14677 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
14678 return SDValue();
14679 switch (N->getOpcode()) {
14680 case ISD::ADD:
14681 return performAddCombine(N, DCI);
14682 case ISD::SUB:
14683 return performSubCombine(N, DCI);
14684 case ISD::UADDO_CARRY:
14685 case ISD::USUBO_CARRY:
14686 return performAddCarrySubCarryCombine(N, DCI);
14687 case ISD::FADD:
14688 return performFAddCombine(N, DCI);
14689 case ISD::FSUB:
14690 return performFSubCombine(N, DCI);
14691 case ISD::FDIV:
14692 return performFDivCombine(N, DCI);
14693 case ISD::SETCC:
14694 return performSetCCCombine(N, DCI);
14695 case ISD::FMAXNUM:
14696 case ISD::FMINNUM:
14697 case ISD::FMAXNUM_IEEE:
14698 case ISD::FMINNUM_IEEE:
14699 case ISD::FMAXIMUM:
14700 case ISD::FMINIMUM:
14701 case ISD::SMAX:
14702 case ISD::SMIN:
14703 case ISD::UMAX:
14704 case ISD::UMIN:
14705  case AMDGPUISD::FMIN_LEGACY:
14706  case AMDGPUISD::FMAX_LEGACY:
14707    return performMinMaxCombine(N, DCI);
14708 case ISD::FMA:
14709 return performFMACombine(N, DCI);
14710 case ISD::AND:
14711 return performAndCombine(N, DCI);
14712 case ISD::OR:
14713 return performOrCombine(N, DCI);
14714 case ISD::FSHR: {
14715    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14716    if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
14717 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
14718 return matchPERM(N, DCI);
14719 }
14720 break;
14721 }
14722 case ISD::XOR:
14723 return performXorCombine(N, DCI);
14724 case ISD::ZERO_EXTEND:
14725 return performZeroExtendCombine(N, DCI);
14726  case ISD::SIGN_EXTEND_INREG:
14727    return performSignExtendInRegCombine(N, DCI);
14728  case AMDGPUISD::FP_CLASS:
14729    return performClassCombine(N, DCI);
14730 case ISD::FCANONICALIZE:
14731 return performFCanonicalizeCombine(N, DCI);
14732 case AMDGPUISD::RCP:
14733 return performRcpCombine(N, DCI);
14734 case ISD::FLDEXP:
14735 case AMDGPUISD::FRACT:
14736 case AMDGPUISD::RSQ:
14737  case AMDGPUISD::RCP_LEGACY:
14738  case AMDGPUISD::RCP_IFLAG:
14739  case AMDGPUISD::RSQ_CLAMP: {
14740 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
14741 SDValue Src = N->getOperand(0);
14742 if (Src.isUndef())
14743 return Src;
14744 break;
14745 }
14746 case ISD::SINT_TO_FP:
14747 case ISD::UINT_TO_FP:
14748 return performUCharToFloatCombine(N, DCI);
14749 case ISD::FCOPYSIGN:
14750 return performFCopySignCombine(N, DCI);
14751  case AMDGPUISD::CVT_F32_UBYTE0:
14752  case AMDGPUISD::CVT_F32_UBYTE1:
14753  case AMDGPUISD::CVT_F32_UBYTE2:
14754  case AMDGPUISD::CVT_F32_UBYTE3:
14755    return performCvtF32UByteNCombine(N, DCI);
14756 case AMDGPUISD::FMED3:
14757 return performFMed3Combine(N, DCI);
14758  case AMDGPUISD::CVT_PKRTZ_F16_F32:
14759    return performCvtPkRTZCombine(N, DCI);
14760 case AMDGPUISD::CLAMP:
14761 return performClampCombine(N, DCI);
14762 case ISD::SCALAR_TO_VECTOR: {
14763 SelectionDAG &DAG = DCI.DAG;
14764 EVT VT = N->getValueType(0);
14765
14766 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
14767 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
14768 SDLoc SL(N);
14769 SDValue Src = N->getOperand(0);
14770 EVT EltVT = Src.getValueType();
14771 if (EltVT != MVT::i16)
14772 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
14773
14774 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
14775 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
14776 }
14777
14778 break;
14779 }
14780  case ISD::EXTRACT_VECTOR_ELT:
14781    return performExtractVectorEltCombine(N, DCI);
14782  case ISD::INSERT_VECTOR_ELT:
14783    return performInsertVectorEltCombine(N, DCI);
14784 case ISD::FP_ROUND:
14785 return performFPRoundCombine(N, DCI);
14786 case ISD::LOAD: {
14787 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
14788 return Widened;
14789 [[fallthrough]];
14790 }
14791 default: {
14792 if (!DCI.isBeforeLegalize()) {
14793 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
14794 return performMemSDNodeCombine(MemNode, DCI);
14795 }
14796
14797 break;
14798 }
14799 }
14800
14801  return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
14802}
14803
14804/// Helper function for adjustWritemask
14805static unsigned SubIdx2Lane(unsigned Idx) {
14806 switch (Idx) {
14807 default: return ~0u;
14808 case AMDGPU::sub0: return 0;
14809 case AMDGPU::sub1: return 1;
14810 case AMDGPU::sub2: return 2;
14811 case AMDGPU::sub3: return 3;
14812 case AMDGPU::sub4: return 4; // Possible with TFE/LWE
14813 }
14814}
14815
14816/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
14817SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
14818 SelectionDAG &DAG) const {
14819 unsigned Opcode = Node->getMachineOpcode();
14820
14821 // Subtract 1 because the vdata output is not a MachineSDNode operand.
14822 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
14823 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
14824 return Node; // not implemented for D16
14825
14826 SDNode *Users[5] = { nullptr };
14827 unsigned Lane = 0;
14828 unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
14829 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
14830 unsigned NewDmask = 0;
14831 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
14832 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
14833 bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
14834 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx)))
14835 ? true
14836 : false;
14837 unsigned TFCLane = 0;
14838 bool HasChain = Node->getNumValues() > 1;
14839
14840 if (OldDmask == 0) {
14841 // These are folded out, but on the chance it happens don't assert.
14842 return Node;
14843 }
14844
14845 unsigned OldBitsSet = llvm::popcount(OldDmask);
14846 // Work out which is the TFE/LWE lane if that is enabled.
14847 if (UsesTFC) {
14848 TFCLane = OldBitsSet;
14849 }
14850
14851 // Try to figure out the used register components
14852 for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
14853 I != E; ++I) {
14854
14855 // Don't look at users of the chain.
14856 if (I.getUse().getResNo() != 0)
14857 continue;
14858
14859 // Abort if we can't understand the usage
14860 if (!I->isMachineOpcode() ||
14861 I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
14862 return Node;
14863
14864 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
14865 // Note that subregs are packed, i.e. Lane==0 is the first bit set
14866 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
14867 // set, etc.
14868 Lane = SubIdx2Lane(I->getConstantOperandVal(1));
14869 if (Lane == ~0u)
14870 return Node;
14871
14872 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
14873 if (UsesTFC && Lane == TFCLane) {
14874 Users[Lane] = *I;
14875 } else {
14876 // Set which texture component corresponds to the lane.
14877 unsigned Comp;
14878 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
14879 Comp = llvm::countr_zero(Dmask);
14880 Dmask &= ~(1 << Comp);
14881 }
14882
14883 // Abort if we have more than one user per component.
14884 if (Users[Lane])
14885 return Node;
14886
14887 Users[Lane] = *I;
14888 NewDmask |= 1 << Comp;
14889 }
14890 }
14891
14892 // Don't allow 0 dmask, as hardware assumes one channel enabled.
14893 bool NoChannels = !NewDmask;
14894 if (NoChannels) {
14895 if (!UsesTFC) {
14896 // No uses of the result and not using TFC. Then do nothing.
14897 return Node;
14898 }
14899 // If the original dmask has one channel - then nothing to do
14900 if (OldBitsSet == 1)
14901 return Node;
14902 // Use an arbitrary dmask - required for the instruction to work
14903 NewDmask = 1;
14904 }
14905 // Abort if there's no change
14906 if (NewDmask == OldDmask)
14907 return Node;
14908
14909 unsigned BitsSet = llvm::popcount(NewDmask);
14910
14911 // Check for TFE or LWE - increase the number of channels by one to account
14912 // for the extra return value
14913 // This will need adjustment for D16 if this is also included in
14914 // adjustWriteMask (this function) but at present D16 are excluded.
14915 unsigned NewChannels = BitsSet + UsesTFC;
14916
14917 int NewOpcode =
14918 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
14919 assert(NewOpcode != -1 &&
14920 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
14921 "failed to find equivalent MIMG op");
14922
14923 // Adjust the writemask in the node
14924 SmallVector<SDValue, 9> Ops;
14925 Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
14926 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
14927 Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
14928
14929 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
14930
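// A NewChannels count of 3 or 5 has no matching register tuple, so the result type is rounded up to the next supported width (4 or 8 elements).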
14931 MVT ResultVT = NewChannels == 1 ?
14932 SVT : MVT::getVectorVT(SVT, NewChannels == 3 ? 4 :
14933 NewChannels == 5 ? 8 : NewChannels);
14934 SDVTList NewVTList = HasChain ?
14935 DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
14936
14937
14938 MachineSDNode *NewNode = DAG.getMachineNode(NewOpcode, SDLoc(Node),
14939 NewVTList, Ops);
14940
14941 if (HasChain) {
14942 // Update chain.
14943 DAG.setNodeMemRefs(NewNode, Node->memoperands());
14944 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
14945 }
14946
14947 if (NewChannels == 1) {
14948 assert(Node->hasNUsesOfValue(1, 0));
14949 SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY,
14950 SDLoc(Node), Users[Lane]->getValueType(0),
14951 SDValue(NewNode, 0));
14952 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
14953 return nullptr;
14954 }
14955
14956 // Update the users of the node with the new indices
14957 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
14958 SDNode *User = Users[i];
14959 if (!User) {
14960 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
14961 // Users[0] is still nullptr because channel 0 doesn't really have a use.
14962 if (i || !NoChannels)
14963 continue;
14964 } else {
14965 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
14966 SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
14967 if (NewUser != User) {
14968 DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
14969 DAG.RemoveDeadNode(User);
14970 }
14971 }
14972
14973 switch (Idx) {
14974 default: break;
14975 case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
14976 case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
14977 case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
14978 case AMDGPU::sub3: Idx = AMDGPU::sub4; break;
14979 }
14980 }
14981
14982 DAG.RemoveDeadNode(Node);
14983 return nullptr;
14984}
14985
14986 static bool isFrameIndexOp(SDValue Op) {
14987 if (Op.getOpcode() == ISD::AssertZext)
14988 Op = Op.getOperand(0);
14989
14990 return isa<FrameIndexSDNode>(Op);
14991}
14992
14993/// Legalize target independent instructions (e.g. INSERT_SUBREG)
14994/// with frame index operands.
14995/// LLVM assumes that inputs to these instructions are registers.
14996 SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
14997 SelectionDAG &DAG) const {
14998 if (Node->getOpcode() == ISD::CopyToReg) {
14999 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
15000 SDValue SrcVal = Node->getOperand(2);
15001
15002 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
15003 // to try understanding copies to physical registers.
15004 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
15005 SDLoc SL(Node);
15006 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
15007 SDValue VReg = DAG.getRegister(
15008 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
15009
15010 SDNode *Glued = Node->getGluedNode();
15011 SDValue ToVReg
15012 = DAG.getCopyToReg(Node->getOperand(0), SL, VReg, SrcVal,
15013 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
15014 SDValue ToResultReg
15015 = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
15016 VReg, ToVReg.getValue(1));
15017 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
15018 DAG.RemoveDeadNode(Node);
15019 return ToResultReg.getNode();
15020 }
15021 }
15022
15023 SmallVector<SDValue, 8> Ops;
15024 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
15025 if (!isFrameIndexOp(Node->getOperand(i))) {
15026 Ops.push_back(Node->getOperand(i));
15027 continue;
15028 }
15029
15030 SDLoc DL(Node);
15031 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
15032 Node->getOperand(i).getValueType(),
15033 Node->getOperand(i)), 0));
15034 }
15035
15036 return DAG.UpdateNodeOperands(Node, Ops);
15037}
15038
15039/// Fold the instructions after selecting them.
15040/// Returns null if users were already updated.
15041 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
15042 SelectionDAG &DAG) const {
15043 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15044 unsigned Opcode = Node->getMachineOpcode();
15045
15046 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
15047 !TII->isGather4(Opcode) &&
15048 AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
15049 return adjustWritemask(Node, DAG);
15050 }
15051
15052 if (Opcode == AMDGPU::INSERT_SUBREG ||
15053 Opcode == AMDGPU::REG_SEQUENCE) {
15054 legalizeTargetIndependentNode(Node, DAG);
15055 return Node;
15056 }
15057
15058 switch (Opcode) {
15059 case AMDGPU::V_DIV_SCALE_F32_e64:
15060 case AMDGPU::V_DIV_SCALE_F64_e64: {
15061 // Satisfy the operand register constraint when one of the inputs is
15062 // undefined. Ordinarily each undef value will have its own implicit_def of
15063 // a vreg, so force these to use a single register.
15064 SDValue Src0 = Node->getOperand(1);
15065 SDValue Src1 = Node->getOperand(3);
15066 SDValue Src2 = Node->getOperand(5);
15067
15068 if ((Src0.isMachineOpcode() &&
15069 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
15070 (Src0 == Src1 || Src0 == Src2))
15071 break;
15072
15073 MVT VT = Src0.getValueType().getSimpleVT();
15074 const TargetRegisterClass *RC =
15075 getRegClassFor(VT, Src0.getNode()->isDivergent());
15076
15077 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
15078 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
15079
15080 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node),
15081 UndefReg, Src0, SDValue());
15082
15083 // src0 must be the same register as src1 or src2, even if the value is
15084 // undefined, so make sure we don't violate this constraint.
15085 if (Src0.isMachineOpcode() &&
15086 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
15087 if (Src1.isMachineOpcode() &&
15088 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
15089 Src0 = Src1;
15090 else if (Src2.isMachineOpcode() &&
15091 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
15092 Src0 = Src2;
15093 else {
15094 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
15095 Src0 = UndefReg;
15096 Src1 = UndefReg;
15097 }
15098 } else
15099 break;
15100
15101 SmallVector<SDValue, 9> Ops(Node->op_begin(), Node->op_end());
15102 Ops[1] = Src0;
15103 Ops[3] = Src1;
15104 Ops[5] = Src2;
15105 Ops.push_back(ImpDef.getValue(1));
15106 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
15107 }
15108 default:
15109 break;
15110 }
15111
15112 return Node;
15113}
15114
15115// Any MIMG instructions that use tfe or lwe require an initialization of the
15116// result register that will be written in the case of a memory access failure.
15117// The required code is also added to tie this init code to the result of the
15118// img instruction.
15119 void SITargetLowering::AddIMGInit(MachineInstr &MI) const {
15120 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15121 const SIRegisterInfo &TRI = TII->getRegisterInfo();
15122 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
15123 MachineBasicBlock &MBB = *MI.getParent();
15124
15125 int DstIdx =
15126 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
15127 unsigned InitIdx = 0;
15128
15129 if (TII->isImage(MI)) {
15130 MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
15131 MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
15132 MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
15133
15134 if (!TFE && !LWE) // intersect_ray
15135 return;
15136
15137 unsigned TFEVal = TFE ? TFE->getImm() : 0;
15138 unsigned LWEVal = LWE ? LWE->getImm() : 0;
15139 unsigned D16Val = D16 ? D16->getImm() : 0;
15140
15141 if (!TFEVal && !LWEVal)
15142 return;
15143
15144 // At least one of TFE or LWE is non-zero
15145 // We have to insert a suitable initialization of the result value and
15146 // tie this to the dest of the image instruction.
15147
15148 // Calculate which dword we have to initialize to 0.
15149 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
15150
15151 // check that dmask operand is found.
15152 assert(MO_Dmask && "Expected dmask operand in instruction");
15153
15154 unsigned dmask = MO_Dmask->getImm();
15155 // Determine the number of active lanes taking into account the
15156 // Gather4 special case
15157 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
15158
15159 bool Packed = !Subtarget->hasUnpackedD16VMem();
15160
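// With packed D16, two result components share each dword, so halve the active-lane count (rounding up); the +1 accounts for the extra TFE/LWE status dword.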
15161 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
15162
15163 // Abandon attempt if the dst size isn't large enough
15164 // - this is in fact an error but this is picked up elsewhere and
15165 // reported correctly.
15166 uint32_t DstSize =
15167 TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15168 if (DstSize < InitIdx)
15169 return;
15170 } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
15171 InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15172 } else {
15173 return;
15174 }
15175
15176 const DebugLoc &DL = MI.getDebugLoc();
15177
15178 // Create a register for the initialization value.
15179 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
15180 unsigned NewDst = 0; // Final initialized value will be in here
15181
15182 // If PRTStrictNull feature is enabled (the default) then initialize
15183 // all the result registers to 0, otherwise just the error indication
15184 // register (VGPRn+1)
15185 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
15186 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
15187
15188 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
15189 for (; SizeLeft; SizeLeft--, CurrIdx++) {
15190 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
15191 // Initialize dword
15192 Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
15193 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
15194 .addImm(0);
15195 // Insert into the super-reg
15196 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
15197 .addReg(PrevDst)
15198 .addReg(SubReg)
15199 .addImm(SIRegisterInfo::getSubRegFromChannel(CurrIdx));
15200
15201 PrevDst = NewDst;
15202 }
15203
15204 // Add as an implicit operand
15205 MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
15206
15207 // Tie the just added implicit operand to the dst
15208 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
15209}
15210
15211/// Assign the register class depending on the number of
15212/// bits set in the writemask
15213 void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
15214 SDNode *Node) const {
15215 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15216
15217 MachineFunction *MF = MI.getParent()->getParent();
15218 MachineRegisterInfo &MRI = MF->getRegInfo();
15219 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
15220
15221 if (TII->isVOP3(MI.getOpcode())) {
15222 // Make sure constant bus requirements are respected.
15223 TII->legalizeOperandsVOP3(MRI, MI);
15224
15225 // Prefer VGPRs over AGPRs in mAI instructions where possible.
15226 // This saves a chain-copy of registers and better balance register
15227 // use between vgpr and agpr as agpr tuples tend to be big.
15228 if (!MI.getDesc().operands().empty()) {
15229 unsigned Opc = MI.getOpcode();
15230 bool HasAGPRs = Info->mayNeedAGPRs();
15231 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
15232 int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
15233 for (auto I :
15234 {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
15235 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) {
15236 if (I == -1)
15237 break;
15238 if ((I == Src2Idx) && (HasAGPRs))
15239 break;
15240 MachineOperand &Op = MI.getOperand(I);
15241 if (!Op.isReg() || !Op.getReg().isVirtual())
15242 continue;
15243 auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
15244 if (!TRI->hasAGPRs(RC))
15245 continue;
15246 auto *Src = MRI.getUniqueVRegDef(Op.getReg());
15247 if (!Src || !Src->isCopy() ||
15248 !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
15249 continue;
15250 auto *NewRC = TRI->getEquivalentVGPRClass(RC);
15251 // All uses of agpr64 and agpr32 can also accept vgpr except for
15252 // v_accvgpr_read, but we do not produce agpr reads during selection,
15253 // so no use checks are needed.
15254 MRI.setRegClass(Op.getReg(), NewRC);
15255 }
15256
15257 if (!HasAGPRs)
15258 return;
15259
15260 // Resolve the rest of AV operands to AGPRs.
15261 if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
15262 if (Src2->isReg() && Src2->getReg().isVirtual()) {
15263 auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
15264 if (TRI->isVectorSuperClass(RC)) {
15265 auto *NewRC = TRI->getEquivalentAGPRClass(RC);
15266 MRI.setRegClass(Src2->getReg(), NewRC);
15267 if (Src2->isTied())
15268 MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
15269 }
15270 }
15271 }
15272 }
15273
15274 return;
15275 }
15276
15277 if (TII->isImage(MI))
15278 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
15279}
15280
15281 static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
15282 uint64_t Val) {
15283 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
15284 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
15285}
15286
15287 MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
15288 const SDLoc &DL,
15289 SDValue Ptr) const {
15290 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15291
15292 // Build the half of the subregister with the constants before building the
15293 // full 128-bit register. If we are building multiple resource descriptors,
15294 // this will allow CSEing of the 2-component register.
15295 const SDValue Ops0[] = {
15296 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
15297 buildSMovImm32(DAG, DL, 0),
15298 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
15299 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
15300 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
15301 };
15302
15303 SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
15304 MVT::v2i32, Ops0), 0);
15305
15306 // Combine the constants and the pointer.
15307 const SDValue Ops1[] = {
15308 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
15309 Ptr,
15310 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
15311 SubRegHi,
15312 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
15313 };
15314
15315 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
15316}
15317
15318/// Return a resource descriptor with the 'Add TID' bit enabled
15319/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
15320/// of the resource descriptor) to create an offset, which is added to
15321/// the resource pointer.
15322 MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
15323 SDValue Ptr, uint32_t RsrcDword1,
15324 uint64_t RsrcDword2And3) const {
15325 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
15326 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
15327 if (RsrcDword1) {
15328 PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
15329 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
15330 0);
15331 }
15332
15333 SDValue DataLo = buildSMovImm32(DAG, DL,
15334 RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
15335 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
15336
15337 const SDValue Ops[] = {
15338 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
15339 PtrLo,
15340 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
15341 PtrHi,
15342 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
15343 DataLo,
15344 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
15345 DataHi,
15346 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)
15347 };
15348
15349 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
15350}
15351
15352//===----------------------------------------------------------------------===//
15353// SI Inline Assembly Support
15354//===----------------------------------------------------------------------===//
15355
15356std::pair<unsigned, const TargetRegisterClass *>
15357 SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
15358 StringRef Constraint,
15359 MVT VT) const {
15360 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
15361
15362 const TargetRegisterClass *RC = nullptr;
15363 if (Constraint.size() == 1) {
15364 const unsigned BitWidth = VT.getSizeInBits();
15365 switch (Constraint[0]) {
15366 default:
15367 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
15368 case 's':
15369 case 'r':
15370 switch (BitWidth) {
15371 case 16:
15372 RC = &AMDGPU::SReg_32RegClass;
15373 break;
15374 case 64:
15375 RC = &AMDGPU::SGPR_64RegClass;
15376 break;
15377 default:
15378 RC = TRI->getSGPRClassForBitWidth(BitWidth);
15379 if (!RC)
15380 return std::pair(0U, nullptr);
15381 break;
15382 }
15383 break;
15384 case 'v':
15385 switch (BitWidth) {
15386 case 16:
15387 RC = &AMDGPU::VGPR_32RegClass;
15388 break;
15389 default:
15390 RC = TRI->getVGPRClassForBitWidth(BitWidth);
15391 if (!RC)
15392 return std::pair(0U, nullptr);
15393 break;
15394 }
15395 break;
15396 case 'a':
15397 if (!Subtarget->hasMAIInsts())
15398 break;
15399 switch (BitWidth) {
15400 case 16:
15401 RC = &AMDGPU::AGPR_32RegClass;
15402 break;
15403 default:
15404 RC = TRI->getAGPRClassForBitWidth(BitWidth);
15405 if (!RC)
15406 return std::pair(0U, nullptr);
15407 break;
15408 }
15409 break;
15410 }
15411 // We actually support i128, i16 and f16 as inline parameters
15412 // even if they are not reported as legal
15413 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
15414 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
15415 return std::pair(0U, RC);
15416 }
15417
15418 if (Constraint.starts_with("{") && Constraint.ends_with("}")) {
15419 StringRef RegName(Constraint.data() + 1, Constraint.size() - 2);
15420 if (RegName.consume_front("v")) {
15421 RC = &AMDGPU::VGPR_32RegClass;
15422 } else if (RegName.consume_front("s")) {
15423 RC = &AMDGPU::SGPR_32RegClass;
15424 } else if (RegName.consume_front("a")) {
15425 RC = &AMDGPU::AGPR_32RegClass;
15426 }
15427
15428 if (RC) {
15429 uint32_t Idx;
15430 if (RegName.consume_front("[")) {
15431 uint32_t End;
15432 bool Failed = RegName.consumeInteger(10, Idx);
15433 Failed |= !RegName.consume_front(":");
15434 Failed |= RegName.consumeInteger(10, End);
15435 Failed |= !RegName.consume_back("]");
15436 if (!Failed) {
15437 uint32_t Width = (End - Idx + 1) * 32;
15438 MCRegister Reg = RC->getRegister(Idx);
15439 if (SIRegisterInfo::isVGPRClass(RC))
15440 RC = TRI->getVGPRClassForBitWidth(Width);
15441 else if (SIRegisterInfo::isSGPRClass(RC))
15442 RC = TRI->getSGPRClassForBitWidth(Width);
15443 else if (SIRegisterInfo::isAGPRClass(RC))
15444 RC = TRI->getAGPRClassForBitWidth(Width);
15445 if (RC) {
15446 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
15447 return std::pair(Reg, RC);
15448 }
15449 }
15450 } else {
15451 bool Failed = RegName.getAsInteger(10, Idx);
15452 if (!Failed && Idx < RC->getNumRegs())
15453 return std::pair(RC->getRegister(Idx), RC);
15454 }
15455 }
15456 }
15457
15458 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
15459 if (Ret.first)
15460 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
15461
15462 return Ret;
15463}
15464
15465static bool isImmConstraint(StringRef Constraint) {
15466 if (Constraint.size() == 1) {
15467 switch (Constraint[0]) {
15468 default: break;
15469 case 'I':
15470 case 'J':
15471 case 'A':
15472 case 'B':
15473 case 'C':
15474 return true;
15475 }
15476 } else if (Constraint == "DA" ||
15477 Constraint == "DB") {
15478 return true;
15479 }
15480 return false;
15481}
15482
15483 SITargetLowering::ConstraintType
15484 SITargetLowering::getConstraintType(StringRef Constraint) const {
15485 if (Constraint.size() == 1) {
15486 switch (Constraint[0]) {
15487 default: break;
15488 case 's':
15489 case 'v':
15490 case 'a':
15491 return C_RegisterClass;
15492 }
15493 }
15494 if (isImmConstraint(Constraint)) {
15495 return C_Other;
15496 }
15497 return TargetLowering::getConstraintType(Constraint);
15498}
15499
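// Mask an inline-asm immediate down to the width of its operand so stray high bits do not end up in the emitted target constant.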
15500static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
15502 Val = Val & maskTrailingOnes<uint64_t>(Size);
15503 }
15504 return Val;
15505}
15506
15507 void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op,
15508 StringRef Constraint,
15509 std::vector<SDValue> &Ops,
15510 SelectionDAG &DAG) const {
15511 if (isImmConstraint(Constraint)) {
15512 uint64_t Val;
15513 if (getAsmOperandConstVal(Op, Val) &&
15514 checkAsmConstraintVal(Op, Constraint, Val)) {
15515 Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
15516 Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
15517 }
15518 } else {
15519 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
15520 }
15521}
15522
15523 bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const {
15524 unsigned Size = Op.getScalarValueSizeInBits();
15525 if (Size > 64)
15526 return false;
15527
15528 if (Size == 16 && !Subtarget->has16BitInsts())
15529 return false;
15530
15532 Val = C->getSExtValue();
15533 return true;
15534 }
15535 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) {
15536 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
15537 return true;
15538 }
15539 if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Op)) {
15540 if (Size != 16 || Op.getNumOperands() != 2)
15541 return false;
15542 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
15543 return false;
15544 if (ConstantSDNode *C = V->getConstantSplatNode()) {
15545 Val = C->getSExtValue();
15546 return true;
15547 }
15548 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
15549 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
15550 return true;
15551 }
15552 }
15553
15554 return false;
15555}
15556
15557 bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint,
15558 uint64_t Val) const {
15559 if (Constraint.size() == 1) {
15560 switch (Constraint[0]) {
15561 case 'I':
15562 return AMDGPU::isInlinableIntLiteral(Val);
15563 case 'J':
15564 return isInt<16>(Val);
15565 case 'A':
15566 return checkAsmConstraintValA(Op, Val);
15567 case 'B':
15568 return isInt<32>(Val);
15569 case 'C':
15570 return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
15571 AMDGPU::isInlinableIntLiteral(Val);
15572 default:
15573 break;
15574 }
15575 } else if (Constraint.size() == 2) {
15576 if (Constraint == "DA") {
15577 int64_t HiBits = static_cast<int32_t>(Val >> 32);
15578 int64_t LoBits = static_cast<int32_t>(Val);
15579 return checkAsmConstraintValA(Op, HiBits, 32) &&
15580 checkAsmConstraintValA(Op, LoBits, 32);
15581 }
15582 if (Constraint == "DB") {
15583 return true;
15584 }
15585 }
15586 llvm_unreachable("Invalid asm constraint");
15587}
15588
15589 bool SITargetLowering::checkAsmConstraintValA(SDValue Op, uint64_t Val,
15590 unsigned MaxSize) const {
15591 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
15592 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
15593 if (Size == 16) {
15594 MVT VT = Op.getSimpleValueType();
15595 switch (VT.SimpleTy) {
15596 default:
15597 return false;
15598 case MVT::i16:
15599 return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
15600 case MVT::f16:
15601 return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
15602 case MVT::bf16:
15603 return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
15604 case MVT::v2i16:
15605 return AMDGPU::getInlineEncodingV2I16(Val).has_value();
15606 case MVT::v2f16:
15607 return AMDGPU::getInlineEncodingV2F16(Val).has_value();
15608 case MVT::v2bf16:
15609 return AMDGPU::getInlineEncodingV2BF16(Val).has_value();
15610 }
15611 }
15612 if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
15613 (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi)))
15614 return true;
15615 return false;
15616}
15617
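// Map an unaligned VGPR/AGPR register class ID to its even-aligned (_Align2) counterpart, or return -1 if there is none.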
15618static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
15619 switch (UnalignedClassID) {
15620 case AMDGPU::VReg_64RegClassID:
15621 return AMDGPU::VReg_64_Align2RegClassID;
15622 case AMDGPU::VReg_96RegClassID:
15623 return AMDGPU::VReg_96_Align2RegClassID;
15624 case AMDGPU::VReg_128RegClassID:
15625 return AMDGPU::VReg_128_Align2RegClassID;
15626 case AMDGPU::VReg_160RegClassID:
15627 return AMDGPU::VReg_160_Align2RegClassID;
15628 case AMDGPU::VReg_192RegClassID:
15629 return AMDGPU::VReg_192_Align2RegClassID;
15630 case AMDGPU::VReg_224RegClassID:
15631 return AMDGPU::VReg_224_Align2RegClassID;
15632 case AMDGPU::VReg_256RegClassID:
15633 return AMDGPU::VReg_256_Align2RegClassID;
15634 case AMDGPU::VReg_288RegClassID:
15635 return AMDGPU::VReg_288_Align2RegClassID;
15636 case AMDGPU::VReg_320RegClassID:
15637 return AMDGPU::VReg_320_Align2RegClassID;
15638 case AMDGPU::VReg_352RegClassID:
15639 return AMDGPU::VReg_352_Align2RegClassID;
15640 case AMDGPU::VReg_384RegClassID:
15641 return AMDGPU::VReg_384_Align2RegClassID;
15642 case AMDGPU::VReg_512RegClassID:
15643 return AMDGPU::VReg_512_Align2RegClassID;
15644 case AMDGPU::VReg_1024RegClassID:
15645 return AMDGPU::VReg_1024_Align2RegClassID;
15646 case AMDGPU::AReg_64RegClassID:
15647 return AMDGPU::AReg_64_Align2RegClassID;
15648 case AMDGPU::AReg_96RegClassID:
15649 return AMDGPU::AReg_96_Align2RegClassID;
15650 case AMDGPU::AReg_128RegClassID:
15651 return AMDGPU::AReg_128_Align2RegClassID;
15652 case AMDGPU::AReg_160RegClassID:
15653 return AMDGPU::AReg_160_Align2RegClassID;
15654 case AMDGPU::AReg_192RegClassID:
15655 return AMDGPU::AReg_192_Align2RegClassID;
15656 case AMDGPU::AReg_256RegClassID:
15657 return AMDGPU::AReg_256_Align2RegClassID;
15658 case AMDGPU::AReg_512RegClassID:
15659 return AMDGPU::AReg_512_Align2RegClassID;
15660 case AMDGPU::AReg_1024RegClassID:
15661 return AMDGPU::AReg_1024_Align2RegClassID;
15662 default:
15663 return -1;
15664 }
15665}
15666
15667// Figure out which registers should be reserved for stack access. Only after
15668// the function is legalized do we know all of the non-spill stack objects or if
15669// calls are present.
15670 void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
15671 MachineRegisterInfo &MRI = MF.getRegInfo();
15672 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
15673 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
15674 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
15675 const SIInstrInfo *TII = ST.getInstrInfo();
15676
15677 if (Info->isEntryFunction()) {
15678 // Callable functions have fixed registers used for stack access.
15679 reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
15680 }
15681
15682 // TODO: Move this logic to getReservedRegs()
15683 // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
15684 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
15685 Register SReg = ST.isWave32()
15686 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
15687 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
15688 &AMDGPU::SGPR_64RegClass);
15689 Info->setSGPRForEXECCopy(SReg);
15690
15691 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
15692 Info->getStackPtrOffsetReg()));
15693 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
15694 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
15695
15696 // We need to worry about replacing the default register with itself in case
15697 // of MIR testcases missing the MFI.
15698 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
15699 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
15700
15701 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
15702 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
15703
15704 Info->limitOccupancy(MF);
15705
15706 if (ST.isWave32() && !MF.empty()) {
15707 for (auto &MBB : MF) {
15708 for (auto &MI : MBB) {
15709 TII->fixImplicitOperands(MI);
15710 }
15711 }
15712 }
15713
15714 // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
15715 // classes if required. Ideally the register class constraints would differ
15716 // per-subtarget, but there's no easy way to achieve that right now. This is
15717 // not a problem for VGPRs because the correctly aligned VGPR class is implied
15718 // from using them as the register class for legal types.
15719 if (ST.needsAlignedVGPRs()) {
15720 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
15721 const Register Reg = Register::index2VirtReg(I);
15722 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
15723 if (!RC)
15724 continue;
15725 int NewClassID = getAlignedAGPRClassID(RC->getID());
15726 if (NewClassID != -1)
15727 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
15728 }
15729 }
15730
15731 TargetLoweringBase::finalizeLowering(MF);
15732}
15733
15734 void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
15735 KnownBits &Known,
15736 const APInt &DemandedElts,
15737 const SelectionDAG &DAG,
15738 unsigned Depth) const {
15739 Known.resetAll();
15740 unsigned Opc = Op.getOpcode();
15741 switch (Opc) {
15742 case ISD::INTRINSIC_WO_CHAIN: {
15743 unsigned IID = Op.getConstantOperandVal(0);
15744 switch (IID) {
15745 case Intrinsic::amdgcn_mbcnt_lo:
15746 case Intrinsic::amdgcn_mbcnt_hi: {
15747 const GCNSubtarget &ST =
15748 DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
15749 // These return at most the (wavefront size - 1) + src1
15750 // As long as src1 is an immediate we can calc known bits
15751 KnownBits Src1Known = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
15752 unsigned Src1ValBits = Src1Known.countMaxActiveBits();
15753 unsigned MaxActiveBits = std::max(Src1ValBits, ST.getWavefrontSizeLog2());
15754 // Cater for potential carry
15755 MaxActiveBits += Src1ValBits ? 1 : 0;
15756 unsigned Size = Op.getValueType().getSizeInBits();
15757 if (MaxActiveBits < Size)
15758 Known.Zero.setHighBits(Size - MaxActiveBits);
15759 return;
15760 }
15761 }
15762 break;
15763 }
15764 }
15765 return AMDGPUTargetLowering::computeKnownBitsForTargetNode(
15766 Op, Known, DemandedElts, DAG, Depth);
15767}
15768
15769 void SITargetLowering::computeKnownBitsForFrameIndex(
15770 const int FI, KnownBits &Known, const MachineFunction &MF) const {
15771 TargetLowering::computeKnownBitsForFrameIndex(FI, Known, MF);
15772
15773 // Set the high bits to zero based on the maximum allowed scratch size per
15774 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
15775 // calculation won't overflow, so assume the sign bit is never set.
15776 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
15777}
15778
15779 static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB,
15780 KnownBits &Known, unsigned Dim) {
15781 unsigned MaxValue =
15782 ST.getMaxWorkitemID(KB.getMachineFunction().getFunction(), Dim);
15783 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
15784}
15785
15786 void SITargetLowering::computeKnownBitsForTargetInstr(
15787 GISelKnownBits &KB, Register R, KnownBits &Known, const APInt &DemandedElts,
15788 const MachineRegisterInfo &MRI, unsigned Depth) const {
15789 const MachineInstr *MI = MRI.getVRegDef(R);
15790 switch (MI->getOpcode()) {
15791 case AMDGPU::G_INTRINSIC:
15792 case AMDGPU::G_INTRINSIC_CONVERGENT: {
15793 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
15794 case Intrinsic::amdgcn_workitem_id_x:
15795 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 0);
15796 break;
15797 case Intrinsic::amdgcn_workitem_id_y:
15798 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 1);
15799 break;
15800 case Intrinsic::amdgcn_workitem_id_z:
15801 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 2);
15802 break;
15803 case Intrinsic::amdgcn_mbcnt_lo:
15804 case Intrinsic::amdgcn_mbcnt_hi: {
15805 // These return at most the wavefront size - 1.
15806 unsigned Size = MRI.getType(R).getSizeInBits();
15807 Known.Zero.setHighBits(Size - getSubtarget()->getWavefrontSizeLog2());
15808 break;
15809 }
15810 case Intrinsic::amdgcn_groupstaticsize: {
15811 // We can report everything over the maximum size as 0. We can't report
15812 // based on the actual size because we don't know if it's accurate or not
15813 // at any given point.
15814 Known.Zero.setHighBits(
15815 llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
15816 break;
15817 }
15818 }
15819 break;
15820 }
15821 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
15822 Known.Zero.setHighBits(24);
15823 break;
15824 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
15825 Known.Zero.setHighBits(16);
15826 break;
15827 case AMDGPU::G_AMDGPU_SMED3:
15828 case AMDGPU::G_AMDGPU_UMED3: {
15829 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
15830
15831 KnownBits Known2;
15832 KB.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1);
15833 if (Known2.isUnknown())
15834 break;
15835
15836 KnownBits Known1;
15837 KB.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1);
15838 if (Known1.isUnknown())
15839 break;
15840
15841 KnownBits Known0;
15842 KB.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1);
15843 if (Known0.isUnknown())
15844 break;
15845
15846 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
15847 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
15848 Known.One = Known0.One & Known1.One & Known2.One;
15849 break;
15850 }
15851 }
15852}
15853
15854 Align SITargetLowering::computeKnownAlignForTargetInstr(
15855 GISelKnownBits &KB, Register R, const MachineRegisterInfo &MRI,
15856 unsigned Depth) const {
15857 const MachineInstr *MI = MRI.getVRegDef(R);
15858 if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
15859 // FIXME: Can this move to generic code? What about the case where the call
15860 // site specifies a lower alignment?
15861 Intrinsic::ID IID = GI->getIntrinsicID();
15862 LLVMContext &Ctx = KB.getMachineFunction().getFunction().getContext();
15863 AttributeList Attrs = Intrinsic::getAttributes(Ctx, IID);
15864 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
15865 return *RetAlign;
15866 }
15867 return Align(1);
15868}
15869
15870 Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
15871 const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
15872 const Align CacheLineAlign = Align(64);
15873
15874 // Pre-GFX10 targets did not benefit from loop alignment
15875 if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
15876 getSubtarget()->hasInstFwdPrefetchBug())
15877 return PrefAlign;
15878
15879 // On GFX10 I$ is 4 x 64 bytes cache lines.
15880 // By default prefetcher keeps one cache line behind and reads two ahead.
15881 // We can modify it with S_INST_PREFETCH for larger loops to have two lines
15882 // behind and one ahead.
15883 // Therefore we can benefit from aligning loop headers if the loop fits in 192 bytes.
15884 // If the loop fits in 64 bytes it always spans no more than two cache lines and
15885 // does not need alignment.
15886 // Else, if the loop is at most 128 bytes, we do not need to modify the prefetch;
15887 // else, if the loop is at most 192 bytes, we need two lines behind.
15888
15889 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15890 const MachineBasicBlock *Header = ML->getHeader();
15891 if (Header->getAlignment() != PrefAlign)
15892 return Header->getAlignment(); // Already processed.
15893
15894 unsigned LoopSize = 0;
15895 for (const MachineBasicBlock *MBB : ML->blocks()) {
15896 // If an inner loop block is aligned, assume on average half of the alignment
15897 // size is added as nops.
15898 if (MBB != Header)
15899 LoopSize += MBB->getAlignment().value() / 2;
15900
15901 for (const MachineInstr &MI : *MBB) {
15902 LoopSize += TII->getInstSizeInBytes(MI);
15903 if (LoopSize > 192)
15904 return PrefAlign;
15905 }
15906 }
15907
15908 if (LoopSize <= 64)
15909 return PrefAlign;
15910
15911 if (LoopSize <= 128)
15912 return CacheLineAlign;
15913
15914 // If any of parent loops is surrounded by prefetch instructions do not
15915 // insert new for inner loop, which would reset parent's settings.
15916 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
15917 if (MachineBasicBlock *Exit = P->getExitBlock()) {
15918 auto I = Exit->getFirstNonDebugInstr();
15919 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
15920 return CacheLineAlign;
15921 }
15922 }
15923
15924 MachineBasicBlock *Pre = ML->getLoopPreheader();
15925 MachineBasicBlock *Exit = ML->getExitBlock();
15926
15927 if (Pre && Exit) {
15928 auto PreTerm = Pre->getFirstTerminator();
15929 if (PreTerm == Pre->begin() ||
15930 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
15931 BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
15932 .addImm(1); // prefetch 2 lines behind PC
15933
15934 auto ExitHead = Exit->getFirstNonDebugInstr();
15935 if (ExitHead == Exit->end() ||
15936 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
15937 BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
15938 .addImm(2); // prefetch 1 line behind PC
15939 }
15940
15941 return CacheLineAlign;
15942}
15943
15944 LLVM_ATTRIBUTE_UNUSED
15945static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
15946 assert(N->getOpcode() == ISD::CopyFromReg);
15947 do {
15948 // Follow the chain until we find an INLINEASM node.
15949 N = N->getOperand(0).getNode();
15950 if (N->getOpcode() == ISD::INLINEASM ||
15951 N->getOpcode() == ISD::INLINEASM_BR)
15952 return true;
15953 } while (N->getOpcode() == ISD::CopyFromReg);
15954 return false;
15955}
15956
15957 bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
15958 FunctionLoweringInfo *FLI,
15959 UniformityInfo *UA) const {
15960 switch (N->getOpcode()) {
15961 case ISD::CopyFromReg: {
15962 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
15963 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
15964 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
15965 Register Reg = R->getReg();
15966
15967 // FIXME: Why does this need to consider isLiveIn?
15968 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
15969 return !TRI->isSGPRReg(MRI, Reg);
15970
15971 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
15972 return UA->isDivergent(V);
15973
15974 assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
15975 return !TRI->isSGPRReg(MRI, Reg);
15976 }
15977 case ISD::LOAD: {
15978 const LoadSDNode *L = cast<LoadSDNode>(N);
15979 unsigned AS = L->getAddressSpace();
15980 // A flat load may access private memory.
15981 return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
15982 }
15983 case ISD::CALLSEQ_END:
15984 return true;
15985 case ISD::INTRINSIC_WO_CHAIN:
15986 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0));
15987 case ISD::INTRINSIC_W_CHAIN:
15988 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
16007 // Target-specific read-modify-write atomics are sources of divergence.
16008 return true;
16009 default:
16010 if (auto *A = dyn_cast<AtomicSDNode>(N)) {
16011 // Generic read-modify-write atomics are sources of divergence.
16012 return A->readMem() && A->writeMem();
16013 }
16014 return false;
16015 }
16016}
16017
16018 bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
16019 EVT VT) const {
16020 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
16021 case MVT::f32:
16022 return !denormalModeIsFlushAllF32(DAG.getMachineFunction());
16023 case MVT::f64:
16024 case MVT::f16:
16025 return !denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
16026 default:
16027 return false;
16028 }
16029}
16030
16031 bool SITargetLowering::denormalsEnabledForType(
16032 LLT Ty, const MachineFunction &MF) const {
16033 switch (Ty.getScalarSizeInBits()) {
16034 case 32:
16035 return !denormalModeIsFlushAllF32(MF);
16036 case 64:
16037 case 16:
16038 return !denormalModeIsFlushAllF64F16(MF);
16039 default:
16040 return false;
16041 }
16042}
16043
16044 bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
16045 const SelectionDAG &DAG,
16046 bool SNaN,
16047 unsigned Depth) const {
16048 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
16049 const MachineFunction &MF = DAG.getMachineFunction();
16050 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
16051
16052 if (Info->getMode().DX10Clamp)
16053 return true; // Clamped to 0.
16054 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
16055 }
16056
16057 return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DAG,
16058 SNaN, Depth);
16059}
16060
16061#if 0
16062// FIXME: This should be checked before unsafe fp atomics are enabled
16063// Global FP atomic instructions have a hardcoded FP mode and do not support
16064// FP32 denormals, and only support v2f16 denormals.
16065static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) {
16066 const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
16067 auto DenormMode = RMW->getParent()->getParent()->getDenormalMode(Flt);
16068 if (&Flt == &APFloat::IEEEsingle())
16069 return DenormMode == DenormalMode::getPreserveSign();
16070 return DenormMode == DenormalMode::getIEEE();
16071}
16072#endif
16073
16074// The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe
16075// floating point atomic instructions. May generate more efficient code,
16076// but may not respect rounding and denormal modes, and may give incorrect
16077// results for certain memory destinations.
16078 static bool unsafeFPAtomicsDisabled(Function *F) {
16079 return F->getFnAttribute("amdgpu-unsafe-fp-atomics").getValueAsString() !=
16080 "true";
16081}
16082
16083 static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) {
16084 LLVMContext &Ctx = RMW->getContext();
16085 SmallVector<StringRef> SSNs;
16086 Ctx.getSyncScopeNames(SSNs);
16087 StringRef MemScope = SSNs[RMW->getSyncScopeID()].empty()
16088 ? "system"
16089 : SSNs[RMW->getSyncScopeID()];
16090
16091 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
16092 << "Hardware instruction generated for atomic "
16093 << RMW->getOperationName(RMW->getOperation())
16094 << " operation at memory scope " << MemScope;
16095}
16096
16097static bool isHalf2OrBFloat2(Type *Ty) {
16098 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
16099 Type *EltTy = VT->getElementType();
16100 return VT->getNumElements() == 2 &&
16101 (EltTy->isHalfTy() || EltTy->isBFloatTy());
16102 }
16103
16104 return false;
16105}
16106
16107static bool isHalf2(Type *Ty) {
16108 FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
16109 return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
16110}
16111
16112static bool isBFloat2(Type *Ty) {
16113 FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
16114 return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
16115}
16116
16117 TargetLowering::AtomicExpansionKind
16118 SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
16119 unsigned AS = RMW->getPointerAddressSpace();
16120 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
16121 return AtomicExpansionKind::NotAtomic;
16122
16123 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
16124 OptimizationRemarkEmitter ORE(RMW->getFunction());
16125 ORE.emit([=]() {
16126 return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
16127 });
16128 return Kind;
16129 };
16130
16131 auto SSID = RMW->getSyncScopeID();
16132 bool HasSystemScope =
16133 SSID == SyncScope::System ||
16134 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
16135
16136 switch (RMW->getOperation()) {
16137 case AtomicRMWInst::Sub:
16138 case AtomicRMWInst::Or:
16139 case AtomicRMWInst::Xor: {
16140 // Atomic sub/or/xor do not work over PCI express, but atomic add
16141 // does. InstCombine transforms these with 0 to or, so undo that.
16142 if (HasSystemScope && AMDGPU::isFlatGlobalAddrSpace(AS)) {
16143 if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
16144 ConstVal && ConstVal->isNullValue())
16145 return AtomicExpansionKind::Expand;
16146 }
16147
16148 break;
16149 }
16150 case AtomicRMWInst::FAdd: {
16151 Type *Ty = RMW->getType();
16152
16153 // TODO: Handle REGION_ADDRESS
16154 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
16155 // DS F32 FP atomics do respect the denormal mode, but the rounding mode
16156 // is fixed to round-to-nearest-even.
16157 //
16158 // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
16159 // round-to-nearest-even.
16160 //
16161 // We ignore the rounding mode problem, even in strictfp. The C++ standard
16162 // suggests it is OK if the floating-point mode may not match the calling
16163 // thread.
16164 if (Ty->isFloatTy()) {
16165 return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None
16166 : AtomicExpansionKind::CmpXChg;
16167 }
16168
16169 if (Ty->isDoubleTy()) {
16170 // Ignores denormal mode, but we don't consider flushing mandatory.
16171 return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None
16172 : AtomicExpansionKind::CmpXChg;
16173 }
16174
16175 if (Subtarget->hasAtomicDsPkAdd16Insts() && isHalf2OrBFloat2(Ty))
16176 return AtomicExpansionKind::None;
16177
16178 return AtomicExpansionKind::CmpXChg;
16179 }
16180
16184
16185 if (Subtarget->hasGFX940Insts() && (Ty->isFloatTy() || Ty->isDoubleTy()))
16186 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16187
16188 if (AS == AMDGPUAS::FLAT_ADDRESS) {
16189 // gfx940, gfx12
16190 // FIXME: Needs to account for no fine-grained memory
16191 if (Subtarget->hasAtomicFlatPkAdd16Insts() && isHalf2OrBFloat2(Ty))
16192 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16193 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
16194 // gfx90a, gfx940, gfx12
16195 // FIXME: Needs to account for no fine-grained memory
16196 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
16197 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16198
16199 // gfx940, gfx12
16200 // FIXME: Needs to account for no fine-grained memory
16201 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isBFloat2(Ty))
16202 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16203 } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16204 // gfx90a, gfx940, gfx12
16205 // FIXME: Needs to account for no fine-grained memory
16206 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
16207 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16208
16209 // While gfx90a/gfx940 supports v2bf16 for global/flat, it does not for
16210 // buffer. gfx12 does have the buffer version.
16211 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isBFloat2(Ty))
16212 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16213 }
16214
16215 if (unsafeFPAtomicsDisabled(RMW->getFunction()))
16216 return AtomicExpansionKind::CmpXChg;
16217
16218 // Always expand system scope fp atomics.
16219 if (HasSystemScope)
16220 return AtomicExpansionKind::CmpXChg;
16221
16222 // global and flat atomic fadd f64: gfx90a, gfx940.
16223 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
16224 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16225
16226 if (AS != AMDGPUAS::FLAT_ADDRESS) {
16227 if (Ty->isFloatTy()) {
16228 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, gfx11+.
16229 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16230 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16231 // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
16232 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16233 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16234 } else {
16235 // gfx908
16236 if (RMW->use_empty() &&
16238 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16239 }
16240 }
16241
16242 // flat atomic fadd f32: gfx940, gfx11+.
16243 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
16244 if (Subtarget->hasFlatAtomicFaddF32Inst())
16245 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16246
16247 // If it is in flat address space, and the type is float, we will try to
16248 // expand it, if the target supports global and lds atomic fadd. The
16249 // reason we need that is, in the expansion, we emit the check of address
16250 // space. If it is in global address space, we emit the global atomic
16251 // fadd; if it is in shared address space, we emit the LDS atomic fadd.
16252 if (Subtarget->hasLDSFPAtomicAddF32()) {
16253 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16254 return AtomicExpansionKind::Expand;
16255 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16256 return AtomicExpansionKind::Expand;
16257 }
16258 }
16259
16261 }
16262 case AtomicRMWInst::FMin:
16263 case AtomicRMWInst::FMax: {
16264 Type *Ty = RMW->getType();
16265
16266 // LDS float and double fmin/fmax were always supported.
16267 if (AS == AMDGPUAS::LOCAL_ADDRESS && (Ty->isFloatTy() || Ty->isDoubleTy()))
16268 return AtomicExpansionKind::None;
16269
16270 if (unsafeFPAtomicsDisabled(RMW->getFunction()))
16271 return AtomicExpansionKind::CmpXChg;
16272
16273 // Always expand system scope fp atomics.
16274 if (HasSystemScope)
16275 return AtomicExpansionKind::CmpXChg;
16276
16277 // For flat and global cases:
16278 // float, double in gfx7. Manual claims denormal support.
16279 // Removed in gfx8.
16280 // float, double restored in gfx10.
16281 // double removed again in gfx11, so only f32 for gfx11/gfx12.
16282 //
16283 // For gfx9, gfx90a and gfx940 support f64 for global (same as fadd), but no
16284 // f32.
16285 //
16286 // FIXME: Check scope and fine grained memory
16287 if (AS == AMDGPUAS::FLAT_ADDRESS) {
16288 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
16289 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16290 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
16291 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16292 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
16293 AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16294 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
16295 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16296 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
16297 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16298 }
16299
16300 return AtomicExpansionKind::CmpXChg;
16301 }
16302 case AtomicRMWInst::Min:
16303 case AtomicRMWInst::Max:
16304 case AtomicRMWInst::UMin:
16305 case AtomicRMWInst::UMax: {
16306 if (AMDGPU::isFlatGlobalAddrSpace(AS) ||
16307 AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16308 // Always expand system scope min/max atomics.
16309 if (HasSystemScope)
16310 return AtomicExpansionKind::CmpXChg;
16311 }
16312 break;
16313 }
16314 default:
16315 break;
16316 }
16317
16318 return AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(RMW);
16319}
16320
16321 TargetLowering::AtomicExpansionKind
16322 SITargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
16323 return LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
16324 ? AtomicExpansionKind::NotAtomic
16325 : AtomicExpansionKind::None;
16326 }
16327
16328 TargetLowering::AtomicExpansionKind
16329 SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
16330 return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
16331 ? AtomicExpansionKind::NotAtomic
16332 : AtomicExpansionKind::None;
16333 }
16334
16335 TargetLowering::AtomicExpansionKind
16336 SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
16337 return CmpX->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
16338 ? AtomicExpansionKind::NotAtomic
16339 : AtomicExpansionKind::None;
16340 }
16341
16342const TargetRegisterClass *
16343SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
16344 const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false);
16345 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
16346 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
16347 return Subtarget->getWavefrontSize() == 64 ? &AMDGPU::SReg_64RegClass
16348 : &AMDGPU::SReg_32RegClass;
16349 if (!TRI->isSGPRClass(RC) && !isDivergent)
16350 return TRI->getEquivalentSGPRClass(RC);
16351 if (TRI->isSGPRClass(RC) && isDivergent)
16352 return TRI->getEquivalentVGPRClass(RC);
16353
16354 return RC;
16355}
16356
16357// FIXME: This is a workaround for DivergenceAnalysis not understanding always
16358// uniform values (as produced by the mask results of control flow intrinsics)
16359// used outside of divergent blocks. The phi users need to also be treated as
16360// always uniform.
16361//
16362// FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis?
16363static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
16364 unsigned WaveSize) {
16365 // FIXME: We assume we never cast the mask results of a control flow
16366 // intrinsic.
16367 // Early exit if the type won't be consistent as a compile time hack.
16368 IntegerType *IT = dyn_cast<IntegerType>(V->getType());
16369 if (!IT || IT->getBitWidth() != WaveSize)
16370 return false;
16371
16372 if (!isa<Instruction>(V))
16373 return false;
16374 if (!Visited.insert(V).second)
16375 return false;
16376 bool Result = false;
16377 for (const auto *U : V->users()) {
16378 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
16379 if (V == U->getOperand(1)) {
16380 switch (Intrinsic->getIntrinsicID()) {
16381 default:
16382 Result = false;
16383 break;
16384 case Intrinsic::amdgcn_if_break:
16385 case Intrinsic::amdgcn_if:
16386 case Intrinsic::amdgcn_else:
16387 Result = true;
16388 break;
16389 }
16390 }
16391 if (V == U->getOperand(0)) {
16392 switch (Intrinsic->getIntrinsicID()) {
16393 default:
16394 Result = false;
16395 break;
16396 case Intrinsic::amdgcn_end_cf:
16397 case Intrinsic::amdgcn_loop:
16398 Result = true;
16399 break;
16400 }
16401 }
16402 } else {
16403 Result = hasCFUser(U, Visited, WaveSize);
16404 }
16405 if (Result)
16406 break;
16407 }
16408 return Result;
16409}
16410
16411 bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
16412 const Value *V) const {
16413 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
16414 if (CI->isInlineAsm()) {
16415 // FIXME: This cannot give a correct answer. This should only trigger in
16416 // the case where inline asm returns mixed SGPR and VGPR results, used
16417 // outside the defining block. We don't have a specific result to
16418 // consider, so this assumes if any value is SGPR, the overall register
16419 // also needs to be SGPR.
16420 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
16421 TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints(
16422 MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
16423 for (auto &TC : TargetConstraints) {
16424 if (TC.Type == InlineAsm::isOutput) {
16425 ComputeConstraintToUse(TC, SDValue());
16426 const TargetRegisterClass *RC = getRegForInlineAsmConstraint(
16427 SIRI, TC.ConstraintCode, TC.ConstraintVT).second;
16428 if (RC && SIRI->isSGPRClass(RC))
16429 return true;
16430 }
16431 }
16432 }
16433 }
16434 SmallPtrSet<const Value *, 16> Visited;
16435 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
16436}
16437
16438 bool SITargetLowering::hasMemSDNodeUser(SDNode *N) const {
16439 SDNode::use_iterator I = N->use_begin(), E = N->use_end();
16440 for (; I != E; ++I) {
16441 if (MemSDNode *M = dyn_cast<MemSDNode>(*I)) {
16442 if (getBasePtrIndex(M) == I.getOperandNo())
16443 return true;
16444 }
16445 }
16446 return false;
16447}
16448
16449 bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
16450 SDValue N1) const {
16451 if (!N0.hasOneUse())
16452 return false;
16453 // Take care of the opportunity to keep N0 uniform
16454 if (N0->isDivergent() || !N1->isDivergent())
16455 return true;
16456 // Check if we have a good chance to form the memory access pattern with the
16457 // base and offset
16458 return (DAG.isBaseWithConstantOffset(N0) &&
16459 hasMemSDNodeUser(*N0->use_begin()));
16460}
16461
16462 bool SITargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
16463 Register N0, Register N1) const {
16464 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
16465}
16466
16467 MachineMemOperand::Flags
16468 SITargetLowering::getTargetMMOFlags(const Instruction &I) const {
16469 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
16470 MachineMemOperand::Flags Flags = MachineMemOperand::MONone;
16471 if (I.getMetadata("amdgpu.noclobber"))
16472 Flags |= MONoClobber;
16473 if (I.getMetadata("amdgpu.last.use"))
16474 Flags |= MOLastUse;
16475 return Flags;
16476}
16477
16478 bool SITargetLowering::checkForPhysRegDependency(
16479 SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI,
16480 const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const {
16481 if (User->getOpcode() != ISD::CopyToReg)
16482 return false;
16483 if (!Def->isMachineOpcode())
16484 return false;
16485 MachineSDNode *MDef = dyn_cast<MachineSDNode>(Def);
16486 if (!MDef)
16487 return false;
16488
16489 unsigned ResNo = User->getOperand(Op).getResNo();
16490 if (User->getOperand(Op)->getValueType(ResNo) != MVT::i1)
16491 return false;
16492 const MCInstrDesc &II = TII->get(MDef->getMachineOpcode());
16493 if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
16494 PhysReg = AMDGPU::SCC;
16495 const TargetRegisterClass *RC =
16496 TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
16497 Cost = RC->getCopyCost();
16498 return true;
16499 }
16500 return false;
16501}
16502
16503 void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
16504 AtomicRMWInst::BinOp Op = AI->getOperation();
16505
16506 if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
16507 Op == AtomicRMWInst::Xor) {
16508 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
16509 assert(cast<Constant>(AI->getValOperand())->isNullValue() &&
16510 "this cannot be replaced with add");
16511 AI->setOperation(AtomicRMWInst::Add);
16512 return;
16513 }
16514
16515 assert(Subtarget->hasAtomicFaddInsts() &&
16516 "target should have atomic fadd instructions");
16517 assert(AI->getType()->isFloatTy() &&
16518 AI->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
16519 "generic atomicrmw expansion only supports FP32 operand in flat "
16520 "address space");
16521 assert(Op == AtomicRMWInst::FAdd && "only fadd is supported for now");
16522
16523 // Given: atomicrmw fadd ptr %addr, float %val ordering
16524 //
16525 // With this expansion we produce the following code:
16526 // [...]
16527 // br label %atomicrmw.check.shared
16528 //
16529 // atomicrmw.check.shared:
16530 // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
16531 // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
16532 //
16533 // atomicrmw.shared:
16534 // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
16535 // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
16536 // float %val ordering
16537 // br label %atomicrmw.phi
16538 //
16539 // atomicrmw.check.private:
16540 // %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
16541 // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
16542 //
16543 // atomicrmw.private:
16544 // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
16545 // %loaded.private = load float, ptr addrspace(5) %cast.private
16546 // %val.new = fadd float %loaded.private, %val
16547 // store float %val.new, ptr addrspace(5) %cast.private
16548 // br label %atomicrmw.phi
16549 //
16550 // atomicrmw.global:
16551 // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
16552 // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
16553 // float %val ordering
16554 // br label %atomicrmw.phi
16555 //
16556 // atomicrmw.phi:
16557 // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
16558 // [ %loaded.private, %atomicrmw.private ],
16559 // [ %loaded.global, %atomicrmw.global ]
16560 // br label %atomicrmw.end
16561 //
16562 // atomicrmw.end:
16563 // [...]
16564
16565 IRBuilder<> Builder(AI);
16566 LLVMContext &Ctx = Builder.getContext();
16567
16568 BasicBlock *BB = Builder.GetInsertBlock();
16569 Function *F = BB->getParent();
16570 BasicBlock *ExitBB =
16571 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
16572 BasicBlock *CheckSharedBB =
16573 BasicBlock::Create(Ctx, "atomicrmw.check.shared", F, ExitBB);
16574 BasicBlock *SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
16575 BasicBlock *CheckPrivateBB =
16576 BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
16577 BasicBlock *PrivateBB =
16578 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
16579 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
16580 BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
16581
16582 Value *Val = AI->getValOperand();
16583 Type *ValTy = Val->getType();
16584 Value *Addr = AI->getPointerOperand();
16585
16586 auto CreateNewAtomicRMW = [AI](IRBuilder<> &Builder, Value *Addr,
16587 Value *Val) -> Value * {
16588 AtomicRMWInst *OldVal =
16589 Builder.CreateAtomicRMW(AI->getOperation(), Addr, Val, AI->getAlign(),
16590 AI->getOrdering(), AI->getSyncScopeID());
16591 SmallVector<std::pair<unsigned, MDNode *>> MDs;
16592 AI->getAllMetadata(MDs);
16593 for (auto &P : MDs)
16594 OldVal->setMetadata(P.first, P.second);
16595 return OldVal;
16596 };
16597
16598 std::prev(BB->end())->eraseFromParent();
16599 Builder.SetInsertPoint(BB);
16600 Builder.CreateBr(CheckSharedBB);
16601
16602 Builder.SetInsertPoint(CheckSharedBB);
16603 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared, {},
16604 {Addr}, nullptr, "is.shared");
16605 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
16606
16607 Builder.SetInsertPoint(SharedBB);
16608 Value *CastToLocal = Builder.CreateAddrSpaceCast(
16609 Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
16610 Value *LoadedShared = CreateNewAtomicRMW(Builder, CastToLocal, Val);
16611 Builder.CreateBr(PhiBB);
16612
16613 Builder.SetInsertPoint(CheckPrivateBB);
16614 CallInst *IsPrivate = Builder.CreateIntrinsic(
16615 Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
16616 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
16617
16618 Builder.SetInsertPoint(PrivateBB);
16619 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
16620 Addr, PointerType::get(Ctx, AMDGPUAS::PRIVATE_ADDRESS));
16621 Value *LoadedPrivate =
16622 Builder.CreateLoad(ValTy, CastToPrivate, "loaded.private");
16623 Value *NewVal = Builder.CreateFAdd(LoadedPrivate, Val, "val.new");
16624 Builder.CreateStore(NewVal, CastToPrivate);
16625 Builder.CreateBr(PhiBB);
16626
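// Anything that is neither shared nor private is treated as global: keep the
// atomic rmw, now on an addrspace(1) pointer.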
16627 Builder.SetInsertPoint(GlobalBB);
16628 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
16629 Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
16630 Value *LoadedGlobal = CreateNewAtomicRMW(Builder, CastToGlobal, Val);
16631 Builder.CreateBr(PhiBB);
16632
16633 Builder.SetInsertPoint(PhiBB);
16634 PHINode *Loaded = Builder.CreatePHI(ValTy, 3, "loaded.phi");
16635 Loaded->addIncoming(LoadedShared, SharedBB);
16636 Loaded->addIncoming(LoadedPrivate, PrivateBB);
16637 Loaded->addIncoming(LoadedGlobal, GlobalBB);
16638 Builder.CreateBr(ExitBB);
16639
16640 AI->replaceAllUsesWith(Loaded);
16641 AI->eraseFromParent();
16642}
16643
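// Lower an idempotent atomicrmw (one whose operation never changes the stored
// value, e.g. 'atomicrmw or ptr %p, i32 0') into an atomic load with the same
// ordering and scope, dropping the now-redundant store half of the
// read-modify-write.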
16644LoadInst *
16645SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
16646 IRBuilder<> Builder(AI);
16647 auto Order = AI->getOrdering();
16648
16649 // The optimization removes the store aspect of the atomicrmw, so the cache
16650 // must still be flushed if the atomic ordering had release semantics. That
16651 // flush does not strictly require a fence; a release fence merely happens to
16652 // perform it. To stay safe, do not replace an atomicrmw that has release semantics.
16653 if (isReleaseOrStronger(Order))
16654 return nullptr;
16655
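// Replace the rmw with an atomic load of the same alignment, ordering and
// sync scope; metadata and the original name are carried over.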
16656 LoadInst *LI = Builder.CreateAlignedLoad(
16657 AI->getType(), AI->getPointerOperand(), AI->getAlign());
16658 LI->setAtomic(Order, AI->getSyncScopeID());
16659 LI->copyMetadata(*AI);
16660 LI->takeName(AI);
16661 AI->replaceAllUsesWith(LI);
16662 AI->eraseFromParent();
16663 return LI;
16664}
static bool isMul(MachineInstr *MI)
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
return SDValue()
static unsigned getIntrinsicID(const SDNode *N)
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls)
Return true if the calling convention is one that we can guarantee TCO for.
static bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
unsigned Intr
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static const LLT S32
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
static const Function * getParent(const Value *V)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
RelocType Type
Definition COFFYAML.cpp:391
#define LLVM_ATTRIBUTE_UNUSED
Definition Compiler.h:203
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
#define LLVM_DEBUG(X)
Definition Debug.h:101
uint64_t Align
uint64_t Addr
uint64_t Size
bool End
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool isSigned(unsigned int Opcode)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
#define DEBUG_TYPE
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
static bool isUndef(ArrayRef< int > Mask)
IRTranslator LLVM IR MI
iv Induction Variable Users
Definition IVUsers.cpp:48
static const unsigned MaxDepth
#define RegName(no)
static LVOptions Options
Definition LVOptions.cpp:25
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
Contains matchers for matching SSA Machine Instructions.
mir Rename Register Operands
unsigned const TargetRegisterInfo * TRI
unsigned Reg
Promote Memory to Register
Definition Mem2Reg.cpp:110
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
const char LLVMTargetMachineRef TM
const SmallVectorImpl< MachineOperand > & Cond
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:39
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:57
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:51
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:45
#define FP_DENORM_FLUSH_NONE
Definition SIDefines.h:1171
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
Definition SIDefines.h:1168
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isHalf2OrBFloat2(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static bool isHalf2(Type *Ty)
bool unsafeFPAtomicsDisabled(Function *F)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool isBFloat2(Type *Ty)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB, KnownBits &Known, unsigned Dim)
static LLVM_ATTRIBUTE_UNUSED bool isCopyFromRegOfInlineAsm(const SDNode *N)
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Interface definition for SIRegisterInfo.
raw_pwrite_stream & OS
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:167
LLVM IR instance of the generic uniformity analysis.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:469
static constexpr int Concat[]
Value * RHS
Value * LHS
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static bool isUniformMMO(const MachineMemOperand *MMO)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
bool hasMadMacF32Insts() const
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
unsigned getWavefrontSizeLog2() const
bool isAmdHsaOrMesa(const Function &F) const
bool hasTrigReducedRange() const
unsigned getWavefrontSize() const
bool hasInv2PiInlineImm() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
Definition APFloat.h:1026
opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition APFloat.cpp:5317
bool isNegative() const
Definition APFloat.h:1354
APInt bitcastToAPInt() const
Definition APFloat.h:1260
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition APFloat.h:1044
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1004
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition APFloat.h:988
bool isInfinity() const
Definition APFloat.h:1351
Class for arbitrary precision integers.
Definition APInt.h:78
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1372
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:238
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition APInt.h:446
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1598
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:276
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition APInt.h:1217
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1201
This class represents an incoming formal argument to a Function.
Definition Argument.h:31
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
Definition ArrayRef.h:165
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:160
An instruction that atomically checks whether a specified value is in a memory location,...
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ Xor
*p = old ^ v
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
This is an SDNode representing atomic operations.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
This class holds the attributes for a function, its return value, and its parameters.
Definition Attributes.h:468
MemoryEffects getMemoryEffects() const
Returns memory effects of the function.
LLVM Basic Block Representation.
Definition BasicBlock.h:61
iterator end()
Definition BasicBlock.h:451
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:209
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:202
BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
BitVector & set()
Definition BitVector.h:351
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:757
@ ICMP_NE
not equal
Definition InstrTypes.h:779
bool isSigned() const
bool isFPPredicate() const
Definition InstrTypes.h:864
bool isIntPredicate() const
Definition InstrTypes.h:865
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
Definition Constants.h:81
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition Constants.h:206
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
Definition Constant.h:42
bool isNullValue() const
Return true if this is the value that would be returned by getNullValue.
Definition Constants.cpp:90
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:110
Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
bool isBigEndian() const
Definition DataLayout.h:239
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition DataLayout.h:504
A debug info location.
Definition DebugLoc.h:33
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLowerinInfo::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:207
iterator_range< arg_iterator > args()
Definition Function.h:855
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:274
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
bool hasPrefetch() const
bool hasD16Images() const
bool hasAtomicDsPkAdd16Insts() const
bool hasImageStoreD16Bug() const
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
bool hasUsableDSOffset() const
True if the offset field of DS instructions works as expected.
bool hasAtomicFMinFMaxF64FlatInsts() const
bool hasDot7Insts() const
bool hasApertureRegs() const
bool hasFlatInstOffsets() const
bool hasAtomicFMinFMaxF32FlatInsts() const
bool hasCompressedExport() const
Return true if the target's EXP instruction has the COMPR flag, which affects the meaning of the EN (...
bool hasGFX90AInsts() const
bool hasDLInsts() const
bool hasBCNT(unsigned Size) const
bool hasMAIInsts() const
bool hasMultiDwordFlatScratchAddressing() const
bool hasArchitectedSGPRs() const
bool hasDenormModeInst() const
bool hasPrivEnabledTrap2NopBug() const
bool hasUnalignedDSAccessEnabled() const
const SIInstrInfo * getInstrInfo() const override
bool hasDot1Insts() const
bool hasAtomicFaddRtnInsts() const
Align getStackAlignment() const
bool hasScalarSubwordLoads() const
bool enableFlatScratch() const
bool hasDwordx3LoadStores() const
bool hasFlatScrRegister() const
bool supportsGetDoorbellID() const
bool hasFlatAtomicFaddF32Inst() const
bool hasKernargPreload() const
const SIRegisterInfo * getRegisterInfo() const override
unsigned getMaxNumVGPRs(unsigned WavesPerEU) const
bool hasMad64_32() const
bool useDS128() const
bool hasLDSMisalignedBug() const
bool hasUserSGPRInit16Bug() const
TrapHandlerAbi getTrapHandlerAbi() const
const SIFrameLowering * getFrameLowering() const override
bool hasUnalignedScratchAccess() const
bool hasAtomicFMinFMaxF32GlobalInsts() const
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
bool hasIntClamp() const
bool hasGFX10_AEncoding() const
bool hasPackedFP32Ops() const
bool hasGFX940Insts() const
bool hasFullRate64Ops() const
bool isTrapHandlerEnabled() const
bool hasLDSFPAtomicAddF64() const
bool hasFlatGlobalInsts() const
bool getScalarizeGlobalBehavior() const
bool hasScalarSMulU64() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasShaderCyclesHiLoRegisters() const
bool hasFFBL() const
bool hasNSAEncoding() const
bool hasSMemRealTime() const
bool usePRTStrictNull() const
bool hasAtomicFMinFMaxF64GlobalInsts() const
bool hasMed3_16() const
bool hasMovrel() const
bool hasAtomicFlatPkAdd16Insts() const
bool hasBFI() const
bool hasUnalignedBufferAccessEnabled() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool hasImageGather4D16Bug() const
bool supportsMinMaxDenormModes() const
bool hasFFBH() const
bool hasAtomicFaddInsts() const
bool hasAtomicBufferGlobalPkAddF16NoRtnInsts() const
bool hasAtomicBufferPkAddBF16Inst() const
bool hasAtomicFaddNoRtnInsts() const
bool hasFlatBufferGlobalAtomicFaddF64Inst() const
bool hasScalarDwordx3Loads() const
bool hasLDSFPAtomicAddF32() const
bool haveRoundOpsF64() const
Have v_trunc_f64, v_ceil_f64, v_rndne_f64.
bool hasDot8Insts() const
bool hasDS96AndDS128() const
bool useFlatForGlobal() const
Generation getGeneration() const
bool hasAtomicBufferGlobalPkAddF16Insts() const
bool hasScalarAddSub64() const
bool hasIEEEMinMax3() const
bool hasUnpackedD16VMem() const
bool hasAtomicGlobalPkAddBF16Inst() const
bool hasAddr64() const
bool hasIEEEMinMax() const
bool hasFmaMixInsts() const
bool hasPackedTID() const
bool hasAddNoCarry() const
bool hasFractBug() const
bool hasGDS() const
bool hasBFE() const
bool hasPrivateSegmentBuffer() const
virtual void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
const MachineFunction & getMachineFunction() const
bool isDivergent(ConstValueRefT V) const
Whether V is divergent at its definition.
unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2671
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
const Function * getFunction() const
Return the function this instruction belongs to.
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
void getAllMetadata(SmallVectorImpl< std::pair< unsigned, MDNode * > > &MDs) const
Get all metadata attached to this Instruction.
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
LoopT * getParentLoop() const
Return the parent loop if it exists or nullptr for top level loops.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:33
Metadata node.
Definition Metadata.h:1069
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
Machine Value Type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
void removeSuccessor(MachineBasicBlock *Succ, bool NormalizeSuccProbs=false)
Remove successor from the successors list of this MachineBasicBlock.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
Align getAlignment() const
Return alignment of the basic block.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value,.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
Definition ModRef.h:198
bool doesNotAccessMemory() const
Whether this function accesses no memory.
Definition ModRef.h:192
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
Definition ModRef.h:195
Root of the metadata hierarchy.
Definition Metadata.h:62
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:65
The optimization diagnostic interface.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Wrapper class representing virtual and physical registers.
Definition Register.h:19
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
Definition Register.h:84
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:95
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
This class provides iterator support for SDUse operands that use a specific SDNode.
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
use_iterator use_begin() const
Provide iteration support to walk over all uses of an SDNode.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
SDNode * getGluedNode() const
If this node has a glue operand, return the node to which the glue operand points.
static use_iterator use_end()
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in target-specific way.
EVT getOptimalMemOpType(const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if be combined with to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::v5i32 because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
SDValue lowerDYNAMIC_STACKALLOCImpl(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
bool isMemOpUniform(const SDNode *N) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns true if Op is known to never be a signaling NaN.
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion in a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
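As a rough illustration of the cmp/select expansion this heuristic gates, the following sketch (not the in-tree lowering; the helper name, the i1 compare type and the usual SelectionDAG headers plus using namespace llvm are assumed) selects each constant-index element whenever the variable index matches:

// Sketch only: expand (extract_elt Vec, Idx) into a chain of setcc/select
// over constant-index extracts. Names and structure are illustrative.
static SDValue expandDynExtractSketch(SDValue Vec, SDValue Idx, const SDLoc &DL,
                                      SelectionDAG &DAG) {
  EVT VecVT = Vec.getValueType();
  EVT EltVT = VecVT.getVectorElementType();
  EVT IdxVT = Idx.getValueType();
  EVT CCVT = MVT::i1; // assumes a plain boolean setcc result for brevity

  SDValue Result = DAG.getUNDEF(EltVT);
  for (unsigned I = 0, E = VecVT.getVectorNumElements(); I != E; ++I) {
    // Extract element I with a constant index.
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vec,
                              DAG.getVectorIdxConstant(I, DL));
    // Compare the variable index against I and select this element on match.
    SDValue Match = DAG.getSetCC(DL, CCVT, Idx,
                                 DAG.getConstant(I, DL, IdxVT), ISD::SETEQ);
    Result = DAG.getSelect(DL, EltVT, Match, Elt, Result);
  }
  return Result;
}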
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
void computeKnownBitsForTargetInstr(GISelKnownBits &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns true if it is reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
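A minimal sketch of what such an override typically fills in for a hypothetical loading intrinsic (the intrinsic shape, operand index and memory type are placeholders, not taken from this file):

// Sketch: report opcode, memory type, pointer operand and access kind so a
// MemIntrinsicNode with a proper MachineMemOperand can be built.
bool fillMemIntrinsicInfoSketch(TargetLowering::IntrinsicInfo &Info,
                                const CallInst &CI) {
  Info.opc = ISD::INTRINSIC_W_CHAIN;                 // intrinsic carries a chain
  Info.memVT = MVT::i32;                             // assumed access width
  Info.ptrVal = CI.getArgOperand(0);                 // assumed pointer argument
  Info.flags = MachineMemOperand::MOLoad |
               MachineMemOperand::MODereferenceable; // read-only, dereferenceable
  return true;                                       // maps to a memory node
}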
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
bool getAddrModeArguments(IntrinsicInst *, SmallVectorImpl< Value * > &, Type *&) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Align computeKnownAlignForTargetInstr(GISelKnownBits &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
bool isKnownNeverSNaN(SDValue Op, unsigned Depth=0) const
SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
const Pass * getPass() const
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), AAResults *AA=nullptr)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
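These two helpers are the usual building blocks for halving a wide vector operation; a sketch of that pattern (illustrative only, in the spirit of splitBinaryVectorOp listed above, assuming the usual SelectionDAG headers):

// Sketch: split both operands of a binary vector op, apply the op to the
// halves, and concatenate the results back into the original type.
static SDValue splitBinaryVectorOpSketch(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VT);
  auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
  auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
  SDValue OpLo =
      DAG.getNode(Op.getOpcode(), DL, LoVT, Lo0, Lo1, Op->getFlags());
  SDValue OpHi =
      DAG.getNode(Op.getOpcode(), DL, HiVT, Hi0, Hi1, Op->getFlags());
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, OpLo, OpHi);
}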
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
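A small usage sketch combining getLoad, getObjectPtrOffset and getStore (chain, pointer, offsets and types are placeholders; note the store is chained on the load's output chain):

// Sketch: load an i32 and store it back 4 bytes further along, ordered
// after the load via its output chain (value 1).
SDValue copyWordSketch(SDValue Chain, SDValue Ptr, const SDLoc &DL,
                       SelectionDAG &DAG) {
  SDValue Load = DAG.getLoad(MVT::i32, DL, Chain, Ptr, MachinePointerInfo());
  SDValue StorePtr = DAG.getObjectPtrOffset(DL, Ptr, TypeSize::getFixed(4));
  return DAG.getStore(Load.getValue(1), DL, Load, StorePtr,
                      MachinePointerInfo(), Align(4));
}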
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node that starts a new call frame, in which InSize bytes are set up inside ...
SDValue getRegister(unsigned Reg, EVT VT)
void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
static const fltSemantics & EVTToAPFloatSemantics(EVT VT)
Returns an APFloat semantics tag appropriate for the given type.
const TargetMachine & getTarget() const
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, unsigned Reg, SDValue N)
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT)
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
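A typical use in a target combine is proving that bits are zero before narrowing a value; a minimal sketch, assuming the usual headers:

// Sketch: return true if the high 32 bits of Op are known zero, so a
// 64-bit value could be treated as a zero-extended 32-bit value.
bool highHalfKnownZeroSketch(SDValue Op, const SelectionDAG &DAG) {
  KnownBits Known = DAG.computeKnownBits(Op);
  return Known.countMinLeadingZeros() >= 32;
}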
SDValue getRegisterMask(const uint32_t *RegMask)
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getCondCode(ISD::CondCode Cond)
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
size_t size() const
Definition SmallVector.h:92
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
iterator insert(iterator I, T &&Elt)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
This class is used to represent ISD::STORE nodes.
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition StringRef.h:838
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:50
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
Definition StringRef.h:250
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:137
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
Definition StringRef.h:131
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition StringRef.h:262
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
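A short usage sketch in the spirit of getRegisterByName above; the specific name/register pairs are illustrative:

// Sketch: map an inline-asm register name to a physical register.
Register lookupNamedRegSketch(StringRef RegName) {
  return StringSwitch<Register>(RegName)
      .Case("m0", AMDGPU::M0)
      .Case("exec_lo", AMDGPU::EXEC_LO)
      .Case("exec_hi", AMDGPU::EXEC_HI)
      .Default(Register()); // unknown name: no register
}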
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows.
TargetInstrInfo - Interface to description of machine instruction set.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
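The idea behind the expansion, sketched for a simple little-endian i32 store split into two i16 stores (the real helper also handles vectors, endianness and legality; helper name and fixed types are assumptions):

// Sketch: store the low half at Ptr and the high half at Ptr + 2, then
// tie the two stores together with a TokenFactor.
SDValue splitStoreSketch(StoreSDNode *ST, SelectionDAG &DAG) {
  SDLoc DL(ST);
  SDValue Val = ST->getValue();   // assume a plain i32 value
  SDValue Ptr = ST->getBasePtr();
  SDValue Chain = ST->getChain();
  EVT HalfVT = MVT::i16;

  SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, Val);
  SDValue HiShift = DAG.getNode(ISD::SRL, DL, MVT::i32, Val,
                                DAG.getShiftAmountConstant(16, MVT::i32, DL));
  SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, HiShift);

  SDValue StoreLo =
      DAG.getStore(Chain, DL, Lo, Ptr, ST->getPointerInfo(), Align(2));
  SDValue HiPtr = DAG.getObjectPtrOffset(DL, Ptr, TypeSize::getFixed(2));
  SDValue StoreHi = DAG.getStore(Chain, DL, Hi, HiPtr,
                                 ST->getPointerInfo().getWithOffset(2),
                                 Align(2));
  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, StoreLo, StoreHi);
}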
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
TargetOptions Options
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:44
OSType getOS() const
Get the parsed operating system type of this triple.
Definition Triple.h:384
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:81
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:345
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:254
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:154
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition Type.h:146
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:143
bool isFunctionTy() const
True if this is an instance of FunctionType.
Definition Type.h:246
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:228
const fltSemantics & getFltSemantics() const
Definition Type.cpp:70
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:140
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:348
A Use represents the edge between a Value definition and its users.
Definition Use.h:43
Value * getOperand(unsigned i) const
Definition User.h:169
LLVM Value Representation.
Definition Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:534
bool use_empty() const
Definition Value.h:344
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.cpp:1075
iterator_range< use_iterator > uses()
Definition Value.h:376
void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:383
Type * getElementType() const
constexpr bool isZero() const
Definition TypeSize.h:156
const ParentTy * getParent() const
Definition ilist_node.h:32
self_iterator getIterator()
Definition ilist_node.h:132
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
Definition AMDGPU.h:415
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler)
bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE bool isKernel(CallingConv::ID CC)
bool isGFX11(const MCSubtargetInfo &STI)
bool isCompute(CallingConv::ID cc)
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isChainCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
bool isShader(CallingConv::ID cc)
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of an AMDGPUFltRounds value.
bool isExtendedGlobalAddrSpace(unsigned AS)
Definition AMDGPU.h:422
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
bool isGraphics(CallingConv::ID cc)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:778
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition ISDOpcodes.h:243
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:751
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition ISDOpcodes.h:44
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:257
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:573
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:742
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:501
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:246
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:811
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:497
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:205
@ GlobalAddress
Definition ISDOpcodes.h:78
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:818
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:557
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:397
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:716
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:236
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ CONVERGENCECTRL_GLUE
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:802
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:634
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:750
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:514
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition ISDOpcodes.h:521
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:356
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:755
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:218
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:229
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:215
@ GET_ROUNDING
Returns current rounding mode: -1 = Undefined; 0 = Round to 0; 1 = Round to nearest, ties to even; 2 = Round to ...
Definition ISDOpcodes.h:908
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:673
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:733
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:614
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:587
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:549
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:209
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:808
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:770
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:338
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:826
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:696
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:310
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:479
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:864
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:484
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:708
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:190
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:538
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:52
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:897
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:814
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:791
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:61
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:507
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:347
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:198
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:529
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
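For example, canonicalizing a compare by swapping its operands must also swap the condition code:

// Sketch: (x < y) becomes (y > x) when the operands are swapped.
ISD::CondCode swappedLessThanSketch() {
  return ISD::getSetCCSwappedOperands(ISD::SETLT); // yields ISD::SETGT
}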
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
StringRef getName(ID id)
Return the LLVM name for an intrinsic, such as "llvm.ppc.altivec.lvx".
AttributeList getAttributes(LLVMContext &C, ID id)
Return the attributes for an intrinsic.
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Offsets
Offsets in bytes from the start of the input buffer.
@ System
Synchronized with respect to all concurrently executing threads.
Definition LLVMContext.h:57
initializer< Ty > init(const Ty &Val)
constexpr double inv_pi
Definition MathExtras.h:54
sandboxir::Value * getValue(llvm::Value *V) const
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:329
@ Offset
Definition DWP.cpp:480
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
Definition Analysis.cpp:233
void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
Definition MathExtras.h:244
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:385
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:169
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:649
@ Done
Definition Threading.h:61
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition Error.h:198
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:317
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2073
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
Definition MathExtras.h:547
ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:394
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:215
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition MathExtras.h:285
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:41
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1729
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:340
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition bit.h:281
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:291
T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:81
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:154
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:163
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition Error.cpp:167
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:193
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
Definition Analysis.cpp:199
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:159
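A tiny sketch of the common use: splitting a 64-bit immediate into the two 32-bit pieces that get materialized separately.

// Sketch: return {low half, high half} of a 64-bit immediate.
std::pair<uint32_t, uint32_t> splitImm64Sketch(uint64_t Imm) {
  return {Lo_32(Imm), Hi_32(Imm)};
}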
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:403
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
unsigned getUndefRegState(bool B)
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
@ AfterLegalizeDAG
Definition DAGCombine.h:19
@ AfterLegalizeVectorOps
Definition DAGCombine.h:18
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:155
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
DWARFExpression::Operation Op
@ TowardPositive
roundTowardPositive.
@ TowardNegative
roundTowardNegative.
unsigned M0(unsigned Val)
Definition VE.h:375
int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
Definition MathExtras.h:235
ArrayRef(const T &OneElt) -> ArrayRef< T >
ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:565
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1749
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:45
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1879
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:212
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:860
#define N
int64_t DWordOffset
int64_t PermMask
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static const fltSemantics & IEEEsingle() LLVM_READNONE
Definition APFloat.cpp:276
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:250
static const fltSemantics & IEEEhalf() LLVM_READNONE
Definition APFloat.cpp:274
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:85
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Definition SCCPSolver.h:41
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
Extended Value Type.
Definition ValueTypes.h:34
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:380
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:136
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:73
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:120
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:290
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:146
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:358
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition ValueTypes.h:233
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:370
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition ValueTypes.h:455
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:397
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:306
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:64
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:167
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:313
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition ValueTypes.h:246
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:318
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:156
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:326
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:151
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
unsigned getOrigArgIndex() const
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:62
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:70
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition KnownBits.h:285
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:237
This class contains a discriminated union of information about pointers in memory operands,...
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:117
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
void setNoUnsignedWrap(bool b)
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
bool DX10Clamp
Used by the vector ALU to force DX10-style treatment of NaNs: when set, clamp NaN to zero; otherwise,...
DenormalMode FP64FP16Denormals
If this is set, neither input nor output denormals are flushed for both f64 and f16/v2f16 instructions...
bool IEEE
Floating point opcodes that support exception flag gathering, quiet and propagate signaling NaN inputs...
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs